In [1]:
import numpy as np, matplotlib as mpl, matplotlib.pyplot as plt, pandas as pd
import seaborn as sns, math, os, warnings
from azureml.core import Dataset

# Подключаемся к workspace и скачиваем датасет

In [2]:
import azureml.core
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Experiment, Workspace

# Check core SDK version number
print("You are currently using version", azureml.core.VERSION, "of the Azure ML SDK")
print("")

# Log In to Azure ML Workspace
interactive_auth = InteractiveLoginAuthentication(tenant_id="76f90eb1-fb9a-4446-9875-4d323d6455ad")

ws = Workspace.from_config(auth=interactive_auth)
print('Workspace name: ' + ws.name, sep='\n')

You are currently using version 1.8.0 of the Azure ML SDK

Workspace name: team25


In [3]:
aml_dataset = Dataset.get_by_name(ws, 'train_ds', version='latest')
pdf = aml_dataset.to_pandas_dataframe()
df_data = pdf.copy()



# Подготавливаем данные

In [4]:
df_data = df_data.rename(columns={'response_att': 'target'})
# Rename & Label encode treatment column
df_data = df_data.rename(columns={'group': 'treatment'})
df_data.treatment = df_data.treatment.map({'control': 0, 'test': 1})

In [5]:
def declare_tc(df:pd.DataFrame):
    """Declare target class
    """
    #CN:
    df['target_class'] = 0 
    #CR:
    df.loc[(df.treatment == 0) & (df.target != 0),'target_class'] = 1 
    #TN:
    df.loc[(df.treatment != 0) & (df.target == 0),'target_class'] = 2 
    #TR:
    df.loc[(df.treatment != 0) & (df.target != 0),'target_class'] = 3 
    return df

In [6]:
df_data.gender = [0 if x == 'Ж' else 1 for x in df_data.gender]

In [7]:
df_data = declare_tc(df_data)

# Функции для разбиения и тестирования train/test

In [20]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import lightgbm as lgb
def uplift_split(df_data:pd.DataFrame):
    """Train-Test Split
    """
    X = df_data.drop(['target','target_class'], axis=1)
    y = df_data.target_class
    X_train, X_test, \
    y_train, y_test  = train_test_split(df_data.drop(['target_class'],axis=1),
                                       y,
                                       test_size=0.2,
                                       random_state=42)
    return X_train, X_test, y_train, y_test

In [67]:
def uplift_model(X_train:pd.DataFrame,
                 X_test:pd.DataFrame,
                 y_train:pd.DataFrame,
                 y_test:pd.DataFrame,
                lqb_params):
    """Using XGB to get the uplift score
    """
    # Create new dataframe
    result = pd.DataFrame(X_test).copy()    
    # Fit the model
    uplift_model \
    = lgb.LGBMClassifier(**lqb_params).fit(
        X_train.drop(['treatment', 'target'],axis=1), y_train,
        eval_set=[(X_test.drop(['treatment', 'target'],axis=1), y_test)],
        verbose=50,
        early_stopping_rounds=50,
        eval_metric="logloss"
    )
    
    
    # Predict using test-data
    uplift_proba \
    = uplift_model.predict_proba(X_test.drop(['treatment', 'target'], axis=1))
    result['proba_CN'] = uplift_proba[:,0] 
    result['proba_CR'] = uplift_proba[:,1] 
    result['proba_TN'] = uplift_proba[:,2] 
    result['proba_TR'] = uplift_proba[:,3]
    result['uplift'] = result.eval('\
    proba_CN/(proba_CN+proba_CR) \
    + proba_TR/(proba_TN+proba_TR) \
    - proba_TN/(proba_TN+proba_TR) \
    - proba_CR/(proba_CN+proba_CR)')  
    # Put the result 
    result['target_class'] = y_test
    result['target'] = X_test['target']
    return result

In [68]:
def uplift(df_data:pd.DataFrame, lqb_params):
    """Combine the split and Modeling function
    """
    X_train, X_test, y_train, y_test = uplift_split(df_data)
    result = uplift_model(X_train, X_test, y_train, y_test, lqb_params)
    return result

In [23]:
res = uplift(df_data)

In [None]:
res.head()

# Hyperopt

In [72]:
from hyperopt import hp

lgbm_hp_hyper_space = {
    "boosting_type": hp.choice("boosting_type", ["gbdt", "dart"]),
    "objective": hp.choice("objective", "binary"),
    "n_estimators": 600,
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.1),
    'subsample': hp.uniform('subsample', 0.2, 0.7),
    'feature_fraction': hp.uniform('feature_fraction', 0.2, 0.7),
    "num_leaves": 4 + hp.randint("num_leaves", 28),
    "max_depth": hp.choice("max_depth", [2, 3, 4, 5]),
    "n_jobs": 30
}

In [73]:
from hyperopt import fmin, tpe, Trials, space_eval
from functools import partial

trials = Trials()
algo = partial(
    tpe.suggest,
    n_startup_jobs=10,
    gamma=0.25,
    n_EI_candidates=24
)

In [None]:
def optimization_function(params):
    print(params)
    res = uplift(df_data, params)
    score = custom_metric(res)
    print(f"score = {score}")
    
    return -score
    

best = fmin(
    optimization_function,
    space=lgbm_hp_hyper_space,
    algo=algo,
    max_evals=50,
    trials=trials,
    verbose=1
)

{'boosting_type': 'dart', 'feature_fraction': 0.5820290153544719, 'learning_rate': 0.010416083794593927, 'max_depth': 4, 'n_estimators': 600, 'n_jobs': 30, 'num_leaves': 18, 'objective': 'b', 'subsample': 0.6959176517648378}
[50]	valid_0's multi_logloss: 0.885496                         
[100]	valid_0's multi_logloss: 0.938396                       
[150]	valid_0's multi_logloss: 0.97406                        
[200]	valid_0's multi_logloss: 0.97516                        
[250]	valid_0's multi_logloss: 0.966778                       
[300]	valid_0's multi_logloss: 0.946903                       
[350]	valid_0's multi_logloss: 0.941444                       
[400]	valid_0's multi_logloss: 0.920278                       
[450]	valid_0's multi_logloss: 0.907478                       
[500]	valid_0's multi_logloss: 0.898386                       
[550]	valid_0's multi_logloss: 0.890855                       
[600]	valid_0's multi_logloss: 0.890633                       
score = 5.94476807

In [25]:
def custom_metric(answers, take_top_ratio=0.25):
    answers.sort_values(by='uplift', inplace=True, ascending=False)
    n_samples = int(np.ceil(answers.shape[0] * take_top_ratio))
    answers = answers.iloc[:n_samples, :]
    answers_test = answers[answers['treatment'] == 1]['target'].sum() / \
                   answers[answers['treatment'] == 1].shape[0]
    answers_control = answers[answers['treatment'] == 0]['target'].sum() / \
                      answers[answers['treatment'] == 0].shape[0]
    return (answers_test - answers_control) * 100

# Final pipeline

In [28]:
X = df_data.drop(['target_class'], axis=1)
y = df_data.target_class

In [30]:
uplift_model \
    = lgb.LGBMClassifier().fit(X.drop(['treatment', 'target'],axis=1), y)

# Загрузка тест выборки

In [31]:
aml_dataset = Dataset.get_by_name(ws, 'test_ds', version='latest')
df_test = aml_dataset.to_pandas_dataframe()



In [None]:
df_test.gender = [0 if x == 'Ж' else 1 for x in df_test.gender]

In [32]:
result = pd.DataFrame(df_test).copy() 

In [54]:
uplift_proba \
    = uplift_model.predict_proba(df_test)
result['proba_CN'] = uplift_proba[:,0] 
result['proba_CR'] = uplift_proba[:,1] 
result['proba_TN'] = uplift_proba[:,2] 
result['proba_TR'] = uplift_proba[:,3]
result['uplift'] = result.eval('\
proba_CN/(proba_CN+proba_CR) \
+ proba_TR/(proba_TN+proba_TR) \
- proba_TN/(proba_TN+proba_TR) \
- proba_CR/(proba_CN+proba_CR)')  
# Put the result 
#result['target_class'] = y_test
#result['target'] = X_test['target']

In [39]:
result.head()

Unnamed: 0,CardHolder,age,cheque_count_12m_g20,cheque_count_12m_g21,cheque_count_12m_g25,cheque_count_12m_g32,cheque_count_12m_g33,cheque_count_12m_g38,cheque_count_12m_g39,cheque_count_12m_g41,...,sale_sum_6m_g44,sale_sum_6m_g54,stdev_days_between_visits_15d,stdev_discount_depth_15d,stdev_discount_depth_1m,proba_CN,proba_CR,proba_TN,proba_TR,uplift
0,16400802,26.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,...,192.76,32.17,2.8868,0.3266,0.3699,0.186164,0.055891,0.553912,0.204033,0.076579
1,15752880,73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,0.0,0.0,0.0,0.216491,0.003769,0.768937,0.010802,-0.006513
2,15978290,32.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,41.99,0.0,0.0,0.0,0.227581,0.00787,0.735668,0.028882,0.008702
3,16604118,24.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,...,388.23,721.33,0.0,,,0.217346,0.018284,0.696829,0.06754,0.021526
4,15880709,42.0,0.0,0.0,1.0,1.0,3.0,1.0,2.0,0.0,...,336.49,306.66,0.0,0.3627,0.2688,0.211381,0.021493,0.694263,0.072863,0.005376


In [55]:
submission = df_test[['CardHolder']]

In [56]:
submission['uplift'] = result['uplift']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
submission.head()

Unnamed: 0,CardHolder,uplift
0,16400802,0.076579
1,15752880,-0.006513
2,15978290,0.008702
3,16604118,0.021526
4,15880709,0.005376


In [58]:
submission.to_csv('4submission.csv', index=False, sep=';')