In [97]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [98]:
data = pd.read_csv('../data/mini.csv')

In [99]:
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id
0,416705,2017-05-07 21:58:10.000000,299.0,515274
1,13891,2018-02-10 17:35:11.000000,1090.0,828115
2,9081,2017-12-21 17:13:44.000000,499.0,695501
3,470904,2017-10-31 10:39:49.000000,290.0,899821
4,58500,2018-03-09 20:57:29.000000,150.0,518554


In [100]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990712 entries, 0 to 1990711
Data columns (total 4 columns):
customer_id    int64
occurence      object
cost           float64
item_id        object
dtypes: float64(1), int64(1), object(2)
memory usage: 60.8+ MB


In [101]:
data['occurence'] = pd.to_datetime(data['occurence'])

In [102]:
data['year'] = data['occurence'].dt.year
data['month'] = data.occurence.dt.month

In [103]:
data = data[(data['year'] == 2018) & (data['month'] <= 6)]

In [104]:
#data.drop('occurence', axis=1,inplace=True)
data.drop('year', axis=1, inplace=True)

In [105]:
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id,month
1,13891,2018-02-10 17:35:11,1090.0,828115,2
4,58500,2018-03-09 20:57:29,150.0,518554,3
5,572181,2018-05-06 20:25:39,990.0,656673,5
7,114804,2018-05-06 13:39:20,3990.0,958574,5
16,568563,2018-06-21 00:57:26,1050.0,969194,6


In [93]:
df = pd.DataFrame(data.groupby('customer_id')['cost'].mean())
df['number_of_transcations'] = data.groupby('customer_id')['cost'].count()
df['total'] = data.groupby('customer_id')['cost'].sum()

In [111]:
five_months = data[data['month']<=5]
target_month = data[data['month']==6]

In [123]:
target_month = target_month[['customer_id']].drop_duplicates(subset=['customer_id'], keep=False)
five_months = five_months[['customer_id']].drop_duplicates(subset=['customer_id'], keep=False)
target_month['target'] = 'yes'
result = pd.merge(five_months, target_month, how='left', on=['customer_id'])
result['target'] = (result.target.isnull()).astype('int')

In [143]:
total = pd.merge(result, df, how='inner', on=['customer_id'])
total.drop('number_of_transcations', axis=1, inplace=True)
total.drop('cost', axis=1, inplace=True)
total.drop('customer_id', axis=1, inplace=True)

In [144]:
y = total['target']
X = total.drop('target', axis=1)

In [145]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17)

In [146]:
tree = DecisionTreeClassifier(random_state=17)

In [148]:
tree_params = {'max_depth': np.arange(1,11), 'max_features':[.5,.7,1]}
tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1)
tree_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [150]:
tree_grid.best_score_

0.9317018702893831

In [151]:
tree_valid_pred = tree_grid.predict(X_valid)
accuracy_score(y_valid, tree_valid_pred)

0.9256766536401795