In [16]:
import pandas as pd
import numpy as np
import math
import json

from sklearn.pipeline import Pipeline, make_union

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import make_scorer,r2_score, mean_squared_error, f1_score, classification_report, accuracy_score

from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, RandomForestClassifier, RandomForestRegressor, \
GradientBoostingClassifier, GradientBoostingRegressor, ExtraTreesClassifier, ExtraTreesRegressor

from sklearn.multioutput import MultiOutputClassifier


In [11]:
transaction = pd.read_csv('transaction_final1.csv')
transaction.drop(columns= ['Unnamed: 0'], axis =1)

Unnamed: 0,age,income,gender_F,gender_M,gender_O,became_member_on_year,became_member_on_month,became_member_on_date,difficulty,duration,...,offer_code_1,offer_code_2,offer_code_3,offer_code_4,offer_code_5,offer_code_6,offer_code_7,offer_code_8,offer_code_9,offer_code_10
0,33,72000.0,0,1,0,2017,4,21,10,7,...,0,0,0,0,0,0,0,0,0,1
1,33,72000.0,0,1,0,2017,4,21,0,4,...,0,0,1,0,0,0,0,0,0,0
2,33,72000.0,0,1,0,2017,4,21,0,3,...,0,0,0,0,0,0,0,1,0,0
3,33,72000.0,0,1,0,2017,4,21,5,5,...,0,0,0,0,0,0,0,0,1,0
4,33,72000.0,0,1,0,2017,4,21,10,10,...,0,0,0,0,0,0,1,0,0,0
5,40,57000.0,0,0,1,2018,1,9,20,10,...,0,0,0,0,1,0,0,0,0,0
6,40,57000.0,0,0,1,2018,1,9,7,7,...,0,0,0,0,0,1,0,0,0,0
7,40,57000.0,0,0,1,2018,1,9,0,4,...,0,0,1,0,0,0,0,0,0,0
8,40,57000.0,0,0,1,2018,1,9,0,3,...,0,0,0,0,0,0,0,1,0,0
9,40,57000.0,0,0,1,2018,1,9,5,7,...,0,0,0,1,0,0,0,0,0,0


# predicts whether or not someone will respond to an offer.

In [9]:
features = transaction.columns.drop(['Unnamed: 0','age', 'income', 'gender_F', 'gender_M', 'gender_O','became_member_on_year', 
                                     'became_member_on_month','became_member_on_date','duration', 'bogo', 'discount',
                                     'informational', 'email', 'mobile', 'social', 'web','influenced',
                                     'offer_code_1', 'offer_code_2','offer_code_3', 'offer_code_4', 'offer_code_5',
                                     'offer_code_6','offer_code_7', 'offer_code_8', 'offer_code_9', 'offer_code_10'])

In [10]:
features

Index(['difficulty', 'reward', 'amount'], dtype='object')

In [12]:
X = transaction[features]
y = transaction['influenced']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
transformer = make_union(StandardScaler())

In [18]:
clf = RandomForestClassifier()

pipeline = Pipeline([
    ('transformer',transformer),
    ('classifier',clf)
])

parameters = [
    {
        "classifier__n_estimators": range(10,110,10)
    },
    {
        "classifier": [AdaBoostClassifier()],
        "classifier__n_estimators": range(10,110,10),
        "classifier__learning_rate":np.linspace(0.1,2.5,20)
    },
    {
        "classifier": [ExtraTreesClassifier()],
        "classifier__n_estimators": range(10,110,10)
    },
    {
        "classifier": [GradientBoostingClassifier()],
        "classifier__n_estimators": range(10,110,10),
        "classifier__learning_rate":np.linspace(0.1,2.5,20)
    }
]

In [19]:
clf = AdaBoostClassifier()

pipeline = Pipeline([
    ('classifier',clf)
])

parameters = [
    {
        "classifier__n_estimators": [10],
        "classifier__learning_rate":[1.8684210526315792]
    }
]

In [20]:
scoring = make_scorer(f1_score)

# Change n_jobs to -1 if you're running more than or less than 8 core cpu.
gridSearch = GridSearchCV(pipeline,
                          parameters,
                          verbose=2,
                          n_jobs = 6,
#                          n_jobs = -1,
                          cv = 5,
                          scoring=scoring,
                          return_train_score=True)

In [25]:
influnce_clf = gridSearch.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   3 out of   5 | elapsed:    0.5s remaining:    0.4s
[Parallel(n_jobs=6)]: Done   5 out of   5 | elapsed:    0.6s finished


In [26]:
y_pred = influnce_clf.predict(X_test)

In [27]:
y_train_pred = influnce_clf.predict(X_train)

In [28]:
print(classification_report(y_true=y_train,y_pred= y_train_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91     36883
           1       0.50      0.00      0.00      7134

   micro avg       0.84      0.84      0.84     44017
   macro avg       0.67      0.50      0.46     44017
weighted avg       0.78      0.84      0.76     44017



In [29]:
print(classification_report(y_true=y_test,y_pred= y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91      9243
           1       0.00      0.00      0.00      1762

   micro avg       0.84      0.84      0.84     11005
   macro avg       0.42      0.50      0.46     11005
weighted avg       0.71      0.84      0.77     11005



  'precision', 'predicted', average, warn_for)


In [30]:
accuracy_score(y_true=y_train,y_pred= y_train_pred), accuracy_score(y_true=y_test,y_pred= y_pred)

(0.8379262557648182, 0.8398909586551567)

In [31]:
f1_score(y_true=y_train,y_pred= y_train_pred), f1_score(y_true=y_test,y_pred= y_pred)

  'precision', 'predicted', average, warn_for)


(0.0008403361344537815, 0.0)

# predicts purchasing habits.


In [32]:
features = transaction.columns.drop(['amount','influenced'])

In [33]:
X = transaction[features]
y = transaction['amount']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
transformer = make_union(StandardScaler())

In [36]:
clf = GradientBoostingRegressor()

pipeline = Pipeline([
    ('transformer',transformer),
    ('classifier',clf)
])

parameters = [
    {
        "classifier__n_estimators": range(90,130,10),
        "classifier__learning_rate":[0.1]
    }
]

In [37]:
scoring = make_scorer(r2_score)

gridSearch = GridSearchCV(pipeline,
                          parameters,
                          verbose=2,
                          n_jobs = 6,
#                          n_jobs = -1,
                          cv = 5,
                          scoring=scoring,
#                          refit='F1',
                          return_train_score=True)

In [38]:
amount_clf = gridSearch.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:  1.0min remaining:    0.0s
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:  1.0min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [39]:
y_pred = amount_clf.predict(X_test)

  res = transformer.transform(X)


In [40]:
y_train_pred = amount_clf.predict(X_train)

  res = transformer.transform(X)


In [41]:
r2_score(y_true=y_test,y_pred=y_pred), r2_score(y_true=y_train,y_pred=y_train_pred)

(0.07688604475321359, 0.11376718790226481)

In [42]:
mean_squared_error(y_true=y_test,y_pred=y_pred),mean_squared_error(y_true=y_train,y_pred=y_train_pred)

(855.6485097422292, 910.6372690476384)