In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import utils.feature_selections as fs
import utils.get_top_n_features as top_features
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import catboost
from utils import MonotonicBinning
from sklearn.model_selection import train_test_split

In [2]:
def drop_corr(df, tech_cols, cut_off):
    delColList = []
    df_ = df.sample(frac = 0.1)
    argList = list(df.columns)
    for i in tech_cols:
        argList.remove(i)
    while 1:
        if (len(argList) <= 1):
            break
        
        feature = argList[0]
        
        for i in argList[1:]:
            try:
                if abs(df_[[i, feature]].corr().iloc[0][1]) >= cut_off:
                    delColList.append(feature)
                    break
            except Exception:
                print("Не рассчиталась корреляция между фичами " + i + " и " + feature)
        argList.remove(feature)
    return delColList
                

In [3]:
def drop_singular(df, cut_off):
    delColList = []
    for col in df.columns:
        if max(df[col].value_counts() / df.shape[0]) > cut_off:
            delColList.append(col)
    return delColList

In [4]:
df_features = pd.read_csv('./data/FINAL_FEATURES_TRAINTEST.tsv', sep = '\t')

In [5]:
df_train = pd.read_csv('./data/clean_data/TRAIN_TARGETS_DATES.csv', sep = ',')

In [6]:
df_val = pd.read_csv('./data/clean_data/VAL_TARGETS_DATES.csv', sep = ',')

In [7]:
df_train = df_train.merge(df_features, how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [8]:
df_val = df_val.merge(df_features, how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [9]:
drop_list_singular = drop_singular(df_train, 0.90)

In [10]:
df_train.drop(drop_list_singular, axis = 1, inplace = True)

In [11]:
tech_cols = ['CLIENT_ID']


drop_list_corr = drop_corr(df_train, tech_cols, 0.75)

In [12]:
df_train.drop(drop_list_corr, axis = 1, inplace = True)

In [13]:
df_train = df_train.apply(lambda x: x.fillna(x.mean()),axis=0)

In [14]:
df_val = df_val.apply(lambda x: x.fillna(x.mean()),axis=0)

In [15]:
df_val = df_val[df_train.columns]

In [16]:
x_train = df_train.drop(['CLIENT_ID', 'TARGET', 'RETRO_DT'], axis = 1)
y_train = df_train['TARGET']

x_test = df_val.drop(['CLIENT_ID', 'TARGET', 'RETRO_DT'], axis = 1)
y_test = df_val['TARGET']

In [17]:
model = lgb.LGBMClassifier()

In [18]:
model.fit(x_train, y_train)

In [19]:
roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

0.5928618052712593

In [20]:
df_train['proba'] = model.predict_proba(x_train)[:, 1]

In [108]:
binner = MonotonicBinning.MonotonicBinning()
df_train['bucket'] = binner.fit_transform(df_train, target='TARGET', column='proba', maxbins = 100, stopping = 7)

In [109]:
df_train['bucket'].value_counts()

2    45531
3    29986
1    20013
4     4314
0     1312
5      678
6       96
7       16
Name: bucket, dtype: int64

In [110]:
df_train[['bucket', 'TARGET']].groupby('bucket').agg('sum')/df_train[['bucket', 'TARGET']].groupby('bucket').agg('count')

Unnamed: 0_level_0,TARGET
bucket,Unnamed: 1_level_1
0,0.002287
1,0.079648
2,0.239639
3,0.396685
4,0.671071
5,0.910029
6,1.0
7,1.0


In [111]:
df_train[df_train['bucket'] == 7]['TARGET'].nunique()

1

In [112]:
df_train['TARGET'].sum()/df_train.shape[0]

0.2749200557157711

In [119]:
df_train['pred'] = df_train['bucket'].apply(lambda x: 1 if x >= 3 else 0)

In [120]:
roc_auc_score(df_train['TARGET'], df_train['pred'])

0.6444766393161319

In [21]:
df_val['proba'] = model.predict_proba(x_test)[:, 1]

In [123]:
df_val['bucket'] = binner.transform(df_val, column='proba')

In [124]:
df_val['pred'] = df_val['bucket'].apply(lambda x: 1 if x >= 3 else 0)

In [125]:
roc_auc_score(df_val['TARGET'], df_val['pred'])

0.5606491674491277

In [None]:
binner = MonotonicBinning.MonotonicBinning()
for col in df_features.drop(['TARGET', 'CLIENT_ID'], axis = 1).columns:
    print(col)
    df_features[col] = binner.fit_transform(df_features, target='TARGET', column=col, maxbins = 20, stopping = 7)

In [22]:
df = pd.concat([df_train[['CLIENT_ID', 'proba']], df_val[['CLIENT_ID', 'proba']]])

In [23]:
df.shape

(127432, 2)

In [25]:
df = df.merge(df_features, how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [None]:
df.to_csv('classic_submission.csv', )

In [130]:
roc_auc_score(df_train['TARGET'], df_train['proba'])

0.7209615006505683

In [26]:
df = pd.concat([df_train, df_val[df_train.columns]])

In [27]:
df.shape

(127432, 219)

In [78]:
model.fit(df.drop(['TARGET', 'CLIENT_ID', 'RETRO_DT', 'proba'], axis = 1), df['TARGET'])

In [80]:
roc_auc_score(df['TARGET'], model.predict_proba(df.drop(['TARGET', 'CLIENT_ID', 'RETRO_DT', 'proba'], axis = 1))[:, 1])

0.7021170103840338

In [81]:
df['proba'] = model.predict_proba(df.drop(['TARGET', 'CLIENT_ID', 'RETRO_DT', 'proba'], axis = 1))[:, 1]

In [82]:
df[['CLIENT_ID', 'proba']]

Unnamed: 0,CLIENT_ID,proba
0,1580286,0.224819
1,68513,0.252503
2,1579276,0.297600
3,1531035,0.257584
4,1547532,0.220082
...,...,...
25481,69974,0.328844
25482,70728,0.287455
25483,70831,0.298865
25484,72181,0.332944


In [83]:
df[['CLIENT_ID', 'proba']].to_csv('classic_submission.csv', index = False)

In [84]:
df[['CLIENT_ID', 'proba']]

Unnamed: 0,CLIENT_ID,proba
0,1580286,0.224819
1,68513,0.252503
2,1579276,0.297600
3,1531035,0.257584
4,1547532,0.220082
...,...,...
25481,69974,0.328844
25482,70728,0.287455
25483,70831,0.298865
25484,72181,0.332944


In [85]:
df_friends = pd.read_csv('train_val_friends_submition.csv')

In [86]:
df_friends.rename(columns = {'TARGET': 'prob_friends'}, inplace = True)

In [87]:
df_friends = df_friends.merge(df[['CLIENT_ID', 'proba']], how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [88]:
df_friends

Unnamed: 0,CLIENT_ID,prob_friends,proba
0,1580286,0.184994,0.224819
1,68513,0.439955,0.252503
2,1579276,0.341905,0.297600
3,1531035,0.234297,0.257584
4,1547532,0.492014,0.220082
...,...,...,...
127427,60197,0.377049,0.313803
127428,60200,0.377049,0.310606
127429,69219,0.377049,0.337830
127430,69821,0.377049,0.393814


In [89]:
df_train.shape[0] + df_val.shape[0]

127432

In [90]:
df_features.shape

(159288, 1015)

In [91]:
df_targets = pd.read_csv('./data/FINAL_TARGETS_DATES_TRAINTEST.tsv', sep = '\t')

In [92]:
df_friends = df_friends.merge(df_targets, how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [93]:
df_friends.drop('RETRO_DT', axis = 1, inplace = True)

In [94]:
df_friends

Unnamed: 0,CLIENT_ID,prob_friends,proba,TARGET
0,1580286,0.184994,0.224819,0
1,68513,0.439955,0.252503,1
2,1579276,0.341905,0.297600,1
3,1531035,0.234297,0.257584,0
4,1547532,0.492014,0.220082,0
...,...,...,...,...
127427,60197,0.377049,0.313803,1
127428,60200,0.377049,0.310606,0
127429,69219,0.377049,0.337830,1
127430,69821,0.377049,0.393814,1


In [95]:
df_val.shape

(25486, 219)

In [96]:
x = df_friends.drop(['CLIENT_ID', 'TARGET'], axis = 1)
y = df_friends['TARGET']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

In [97]:
model_blend = LogisticRegression()
model_blend.fit(x_train, y_train)

In [98]:
roc_auc_score(y_train, model_blend.predict_proba(x_train)[:, 1])

0.91440518376026

In [99]:
roc_auc_score(y_test, model_blend.predict_proba(x_test)[:, 1])

0.9161909225164502

In [100]:
df_test = pd.read_csv('./data/clean_data/TEST_TARGETS_DATES.csv', sep = ',')

In [101]:
df_test = df_test.merge(df_features, how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [102]:
df_test.drop(['RETRO_DT', 'TARGET'], axis = 1, inplace = True)

In [104]:
df_test['proba']=model.predict_proba(df_test[model.feature_name_])[:, 1]

In [108]:
df_test[['CLIENT_ID', 'proba']]

Unnamed: 0,CLIENT_ID,proba
0,1025140,0.367665
1,1029732,0.331300
2,1079794,0.245140
3,1116331,0.253892
4,1136822,0.252879
...,...,...
31853,70294,0.425494
31854,71247,0.343590
31855,71657,0.251240
31856,72631,0.226346


In [109]:
test_friend = pd.read_csv('test_friends_submition.csv', sep = ',')

In [110]:
test_friend

Unnamed: 0,CLIENT_ID,TARGET
0,1025140,0.475807
1,1029732,0.556455
2,1079794,0.593034
3,1136822,0.020654
4,114541,0.140779
...,...,...
31853,53446,0.370026
31854,57109,0.370026
31855,57114,0.370026
31856,68831,0.370026


In [111]:
test_friend.rename(columns = {'TARGET': 'prob_friends'}, inplace = True)

In [112]:
test_friend = test_friend.merge(df_test[['CLIENT_ID', 'proba']], how = 'left', left_on = 'CLIENT_ID', right_on = 'CLIENT_ID')

In [119]:
df_subm = df_test[['CLIENT_ID']]
df_subm['TARGET'] = model_blend.predict_proba(test_friend.drop('CLIENT_ID', axis = 1))[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subm['TARGET'] = model_blend.predict_proba(test_friend.drop('CLIENT_ID', axis = 1))[:, 1]


In [121]:
df_subm.to_csv('new-subm.csv', index = False)