In [1]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
train_df = pd.read_csv('../../data/mini.csv')
train_df['occurence'] = train_df['occurence'].apply(pd.to_datetime)


In [28]:
hz = pd.DataFrame(train_df.groupby('customer_id')['cost'].mean())
hz['number_of_transactions'] = train_df.groupby('customer_id')['cost'].count()
hz['total'] = train_df.groupby('customer_id')['cost'].sum()

In [32]:
hz['number_of_transactions'].

customer_id
1          38
5           1
37        100
53          3
54         20
153        11
154         4
155         5
156        23
158        13
160         2
163         3
164        29
165        32
167        14
170        39
172         9
175        42
176         2
186        80
189         4
191        43
192        27
193        12
195        15
196        28
197        76
198        30
200        62
201        42
         ... 
627653      4
627656      1
627659      1
627660      1
627662      2
627663      2
627665      3
627667      1
627668      1
627670      1
627671      4
627673      3
627676      1
627678      1
627681      1
627682      1
627684      9
627686      2
627688      1
627690      1
627691      1
627693      4
627704      1
627708      1
627711      1
627716      2
627736      1
627738      1
627740      3
627746      1
Name: number_of_transactions, Length: 245676, dtype: int64

In [3]:
train_df['year'] = train_df['occurence'].apply(lambda st: st.year)
train_df['month'] = train_df['occurence'].apply(lambda st: st.month)

In [4]:
train_df.drop('occurence', axis=1, inplace=True)

In [5]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1990712 entries, 0 to 1990711
Data columns (total 5 columns):
customer_id    int64
cost           float64
item_id        object
year           int64
month          int64
dtypes: float64(1), int64(3), object(1)
memory usage: 75.9+ MB


In [6]:
X_train = train_df[(train_df['year']==2018) & (train_df['month']<=6)]

In [8]:
X_train.head()

Unnamed: 0,customer_id,cost,item_id,year,month
1,13891,1090.0,828115,2018,2
4,58500,150.0,518554,2018,3
5,572181,990.0,656673,2018,5
7,114804,3990.0,958574,2018,5
16,568563,1050.0,969194,2018,6


In [10]:
five_month = X_train[X_train['month']<=5]

In [11]:
target = X_train[X_train['month']==6]

In [12]:
target.head()

Unnamed: 0,customer_id,cost,item_id,year,month
16,568563,1050.0,969194,2018,6
83,589411,2490.0,977771,2018,6
88,11551,850.0,689153,2018,6
93,564748,4499.0,956799,2018,6
94,106202,8990.0,961388,2018,6


In [13]:
five_month.head()

Unnamed: 0,customer_id,cost,item_id,year,month
1,13891,1090.0,828115,2018,2
4,58500,150.0,518554,2018,3
5,572181,990.0,656673,2018,5
7,114804,3990.0,958574,2018,5
17,422260,1100.0,451858,2018,1


In [14]:
five_month= five_month[['customer_id']].drop_duplicates(subset=['customer_id'], keep=False)

In [15]:
target = target[['customer_id']].drop_duplicates(subset=['customer_id'], keep=False)

In [16]:
five_month.shape, target.shape

((46059, 1), (26205, 1))

In [17]:
target['target'] = 'Yes'

In [18]:
target.head()

Unnamed: 0,customer_id,target
93,564748,Yes
116,307543,Yes
225,297809,Yes
266,457268,Yes
336,537722,Yes


In [19]:
result = pd.merge(five_month,target,how = 'left', on = ['customer_id'])

In [20]:
result['target'] = (result['target'].isnull()).astype('int')

In [23]:
result = pd.merge(result, hz, how = 'left', on = ['customer_id'])

KeyError: 'customer_id'

In [216]:
y  = result['target']

In [235]:
X = result.drop('target', axis=1)
X.shape

(46059, 1)

In [237]:
tree = DecisionTreeClassifier(random_state=17)


In [238]:
tree_params = {'max_depth': list(range(1,20,4)),'min_samples_leaf': list(range(1, 20,4))}
X_tr, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [239]:
tree_grid = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1)
tree_grid.fit(X_tr, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 9, 13, 17], 'min_samples_leaf': [1, 5, 9, 13, 17]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [240]:
X_tr.shape, y_train.shape

((30859, 1), (30859,))

In [241]:
np.mean(cross_val_score(tree_grid, X_tr, y_train, cv=5))

0.9286755942205772

In [242]:
tree_valid_pred = tree_grid.predict(X_test)

In [245]:
accuracy_score(y_test, tree_valid_pred)

0.9323684210526316