In [20]:
import os
import gc
import time
import psutil
import datetime
import numpy as np
import pandas as pd
import catboost as cat
import lightgbm as lgb
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from scipy.stats import entropy, pearsonr, stats
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from scipy.stats import mode

In [21]:
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)

path = "./0527/"
feat_path = path + "data/"

In [22]:
def get_app_feats(df):
    phones_app = df[["phone_no_m"]].copy()
    phones_app = phones_app.drop_duplicates(subset=['phone_no_m'], keep='last')
    tmp = df.groupby("phone_no_m")["busi_name"].agg(busi_count="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    """使用的流量统计
    """
    tmp = df.groupby("phone_no_m")["flow"].agg(flow_mean="mean",
                                               flow_median="median",
                                               flow_min="min",
                                               flow_max="max",
                                               flow_var="var",
                                               flow_skew="skew",
                                               flow_std="std",
                                               flow_quantile="quantile",
                                               flow_sem="sem",
                                               flow_sum="sum")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["month_id"].agg(month_ids="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    # 月流量使用统计
    phones_app["flow_month"] = phones_app["flow_sum"] / phones_app["month_ids"]
    return phones_app

In [23]:
def get_sms_feats(df):
    df['request_datetime'] = pd.to_datetime(df['request_datetime'])
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day

    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # 对话人数和对话次数
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count", sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"] / tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """短信下行比例
    """
    calltype2 = df[df["calltype_id"] == 2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m["calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
    """短信时间
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(hour_mode=lambda x: mode(x).mode if np.size(mode(x).mode) > 0 else None,
                                                hour_mode_count=lambda x: mode(x).count if np.size(mode(x).mode) > 0 else None,
                                                hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["day"].agg(day_mode=lambda x: mode(x).mode if np.size(mode(x).mode) > 0 else None,
                                                day_mode_count=lambda x: mode(x).count if np.size(mode(x).mode) > 0 else None,
                                                day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    return phone_no_m

In [24]:
def get_voc_feat(df):
    df["start_datetime"] = pd.to_datetime(df['start_datetime'])
    df["hour"] = df['start_datetime'].dt.hour
    df["day"] = df['start_datetime'].dt.day
    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # 对话人数和对话次数
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(opposite_count="count", opposite_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """主叫通话
    """
    df_call = df[df["calltype_id"] == 1].copy()
    tmp = df_call.groupby("phone_no_m")["imei_m"].agg(voccalltype1="count", imeis="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    phone_no_m["voc_calltype1"] = phone_no_m["voccalltype1"] / phone_no_m["opposite_count"]
    tmp = df_call.groupby("phone_no_m")["city_name"].agg(city_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df_call.groupby("phone_no_m")["county_name"].agg(county_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    """和固定通话者的对话统计
    """
    tmp = df.groupby(["phone_no_m", "opposite_no_m"])["call_dur"].agg(count="count", sum="sum")
    phone2opposite = tmp.groupby("phone_no_m")["count"].agg(phone2opposite_mean="mean"
                                                            , phone2opposite_median="median"
                                                            , phone2opposite_max="max"
                                                            , phone2opposite_min="min"
                                                            , phone2opposite_var="var"
                                                            , phone2opposite_skew="skew"
                                                            , phone2opposite_sem="sem"
                                                            , phone2opposite_std="std"
                                                            , phone2opposite_quantile="quantile"
                                                            )
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
    phone2opposite = tmp.groupby("phone_no_m")["sum"].agg(phone2oppo_sum_mean="mean"
                                                          , phone2oppo_sum_median="median"
                                                          , phone2oppo_sum_max="max"
                                                          , phone2oppo_sum_min="min"
                                                          , phone2oppo_sum_var="var"
                                                          , phone2oppo_sum_skew="skew"
                                                          , phone2oppo_sum_sem="sem"
                                                          , phone2oppo_sum_std="std"
                                                          , phone2oppo_sum_quantile="quantile"
                                                          )
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")

    """通话时间长短统计
    """
    tmp = df.groupby("phone_no_m")["call_dur"].agg(call_dur_mean="mean"
                                                   , call_dur_median="median"
                                                   , call_dur_max="max"
                                                   , call_dur_min="min"
                                                   , call_dur_var="var"
                                                   , call_dur_skew="skew"
                                                   , call_dur_sem="sem"
                                                   , call_dur_std="std"
                                                   , call_dur_quantile="quantile"
                                                   )
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["city_name"].agg(city_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["county_name"].agg(county_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["calltype_id"].agg(calltype_id_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    #print(phone_no_m)
    """通话时间点偏好
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(hour_mode=lambda x: mode(x).mode if np.size(mode(x).mode) > 0 else None,
                                                hour_mode_count=lambda x: mode(x).count if np.size(mode(x).mode) > 0 else None,
                                                hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["day"].agg(day_mode=lambda x: mode(x).mode if np.size(mode(x).mode) > 0 else None,
                                                day_mode_count=lambda x: mode(x).count if np.size(mode(x).mode) > 0 else None,
                                                day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    return phone_no_m

In [25]:
def feats():
    test_voc = pd.read_csv(path + 'test/test_voc.csv', )
    test_voc_feat = get_voc_feat(test_voc)
    test_voc_feat.to_csv(feat_path + "test_voc_feat.csv", index=False)

    test_app = pd.read_csv(path + 'test/test_app.csv', )
    test_app_feat = get_app_feats(test_app)
    test_app_feat.to_csv(feat_path + "test_app_feat.csv", index=False)

    test_sms = pd.read_csv(path + 'test/test_sms.csv', )
    test_sms_feat = get_sms_feats(test_sms)
    test_sms_feat.to_csv(feat_path + "test_sms_feat.csv", index=False)

    train_voc = pd.read_csv(path + 'train/train_voc.csv', )
    train_voc_feat = get_voc_feat(train_voc)
    train_voc_feat.to_csv(feat_path + "train_voc_feat.csv", index=False)

    train_app = pd.read_csv(path + 'train/train_app.csv', )
    train_app_feat = get_app_feats(train_app)
    train_app_feat.to_csv(feat_path + "train_app_feat.csv", index=False)

    train_sms = pd.read_csv(path + 'train/train_sms.csv', )
    train_sms_feat = get_sms_feats(train_sms)
    train_sms_feat.to_csv(feat_path + "train_sms_feat.csv", index=False)

    #test_vocfs = pd.read_csv(path + 'zpfsdata/test_voc.csv', )
    #test_voc_featfs = get_voc_feat(test_vocfs)
    #test_voc_featfs.to_csv(path + "zpfsdata/test_voc_feat.csv", index=False)

    #test_appfs = pd.read_csv(path + 'zpfsdata/test_app.csv', )
    #test_app_featfs = get_app_feats(test_appfs)
    #test_app_featfs.to_csv(path + "zpfsdata/test_app_feat.csv", index=False)

    #test_smsfs = pd.read_csv(path + 'zpfsdata/test_sms.csv', )
    #test_sms_featfs = get_sms_feats(test_smsfs)
    #test_sms_featfs.to_csv(path + "zpfsdata/test_sms_feat.csv", index=False)

In [26]:
feats()

  if (await self.run_code(code, result,  async_=asy)):


In [27]:
# load april features
test_app_feat = pd.read_csv(feat_path + 'test_app_feat.csv')
test_voc_feat = pd.read_csv(feat_path + 'test_voc_feat.csv')
test_sms_feat = pd.read_csv(feat_path + "test_sms_feat.csv")
test_user = pd.read_csv(path + 'test/test_user.csv')
test_user = test_user.merge(test_app_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_voc_feat, on="phone_no_m", how="left")
test_user = test_user.merge(test_sms_feat, on="phone_no_m", how="left")
test_user["city_name"] = LabelEncoder().fit_transform(test_user["city_name"].astype(str))
test_user["county_name"] = LabelEncoder().fit_transform(test_user["county_name"].astype(str))
# load april label
#test_user_lb1 = pd.read_csv(path + 'zpfsdata/4yuelabel1.csv')
#test_user_lb2 = pd.read_csv(path + 'zpfsdata/4yuelabel2.csv')
# concat april label and merge with features
#test_user_label = pd.concat([test_user_lb1, test_user_lb2])
#test_user =test_user.merge(test_user_label, on="phone_no_m", how="left")
test_user.rename(columns={"arpu_202004": "arpu_202005"}, inplace=True)

In [28]:
sub = test_user[["phone_no_m"]].copy()

In [29]:
#load train features and label
train_app_feat = pd.read_csv(feat_path + "train_app_feat.csv")
train_voc_feat = pd.read_csv(feat_path + "train_voc_feat.csv")
train_sms_feat = pd.read_csv(feat_path + "train_sms_feat.csv")
train_user=pd.read_csv(path+'train/train_user.csv')
drop_r = ["arpu_201908","arpu_201909","arpu_201910","arpu_201911","arpu_201912","arpu_202001","arpu_202002"]
train_user.drop(drop_r, axis=1,inplace=True)
train_user.rename(columns={"arpu_202003":"arpu_202005"},inplace=True)
train_user = train_user.merge(train_app_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_voc_feat, on="phone_no_m", how="left")
train_user = train_user.merge(train_sms_feat, on="phone_no_m", how="left")
train_user["city_name"] = LabelEncoder().fit_transform(train_user["city_name"].astype(str))
train_user["county_name"] = LabelEncoder().fit_transform(train_user["county_name"].astype(str))

KeyError: 'phone_no_m'

In [18]:
train_user.to_csv('2.csv')

In [18]:
# concat preli data(train and test)
# train_user = pd.concat([train_user, test_user])
# final label
train_label = train_user[["label"]].copy()

In [19]:
# drop phone_no_m
test_user.drop(["phone_no_m"], axis=1, inplace=True)
train_user.drop(["phone_no_m", "label"], axis=1, inplace=True)

KeyError: "['phone_no_m'] not found in axis"

In [21]:
"""bulid cat lgb xgb model"""
depth = 8
cv = 5

In [26]:
# create catboost model
catclf = cat.CatBoostClassifier(
    allow_writing_files=False
    , od_type='Iter'
    , silent=True
)
# final parameters
cat_grid = {'depth': [depth]
    , 'bootstrap_type': ['Bernoulli']
    , 'od_type': ['Iter']
    , 'l2_leaf_reg': [15]
    , 'learning_rate': [0.1]
    , 'allow_writing_files': [False]
    , 'silent': [True]
            }

# search and fit
catgrid = GridSearchCV(cat.CatBoostClassifier(), param_grid=cat_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose=10)
catgrid.fit(train_user, train_label['label'])
train_score = catgrid.score(train_user, train_label['label'])
print(f"训练集得分: {train_score}")

# predict output prob
test_userfs = test_user.fillna(test_user.quantile(0.39))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
#print(test_user)
#print(test_userfs)
cat_proba = catgrid.predict_proba(test_userfs)
rslt_prob_cat = pd.DataFrame(cat_proba)
rslt_prob_cat.columns = ['lb0', 'lb1']

# 获取最佳参数组合
best_params = catgrid.best_params_
print("Best Parameters:", best_params)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
训练集得分: 1.0
      city_name  county_name  idcard_cnt  arpu_202005  busi_count   flow_mean  flow_median  \
0            11          133           1         9.00           0    0.000000     0.000000   
1             5           38           1          NaN           0    0.000000     0.000000   
2            11          134           1         9.00           5    2.047896     0.015533   
3            16           81           1       138.70           0    0.000000     0.000000   
4            11          143           2        39.00         113   41.574596     0.120334   
5            11          148           1         9.80           0    0.000000     0.000000   
6            16           91           1        20.10           2    0.003933     0.000879   
7            11           76           3          NaN          61    3.401650     0.082480   
8             0            0           1        76.04           3    0.037396     0.

In [23]:
# create lgb model
# final parameters
lgb_grid = {'booster': ['gbdt']
    , 'num_leaves': [31]
    , 'min_child_weight': [4]
    , 'feature_fraction': [0.7]
    , 'bagging_fraction': [0.8]
    , 'bagging_freq': [1]
            }

# search and fit
lgbgrid = GridSearchCV(lgb.LGBMClassifier(), param_grid=lgb_grid, cv=cv, scoring='f1_macro', n_jobs=-1, verbose=10)
lgbgrid.fit(train_user, train_label['label'])
train_score_1 = lgbgrid.score(train_user, train_label['label'])
print(f"训练集得分: {train_score_1}")

# predict output prob
test_userfs_2 = test_user.fillna(test_user.quantile(0.34))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
lgb_proba = lgbgrid.predict_proba(test_userfs)
rslt_prob_lgb = pd.DataFrame(lgb_proba)
rslt_prob_lgb.columns = ['lb0', 'lb1']

print(lgbgrid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] Number of positive: 1962, number of negative: 4144
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13004
[LightGBM] [Info] Number of data points in the train set: 6106, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.321323 -> initscore=-0.747697
[LightGBM] [Info] Start training from score -0.747697
训练集得分: 0.9945338614647482
{'bagging_fraction': 0.8, 'bagging_freq': 1, 'booster': 'gbdt', 'feature_fraction': 0.7, 'min_child_weight': 4, 'num_leaves': 31}


In [24]:
# create xgb model
# final parameters
from xgboost import XGBClassifier

xgbclf = XGBClassifier(base_score=0.5
                       , booster='gbtree'
                       , colsample_bytree=0.9
                       , learning_rate=0.1
                       , max_depth=8
                       , min_child_weight=7
                       , n_estimators=100
                       , n_jobs=-1
                       , objective='binary:logistic'
                       , subsample=0.75
                       , verbosity=1)
# fit
xgbclf.fit(train_user, train_label['label'])
train_score_2 = xgbclf.score(train_user, train_label['label'])
print(f"训练集得分: {train_score_2}")


# predict output prob
test_userfs_3 = test_user.fillna(test_user.quantile(0.319))
test_userfs['arpu_202005'] = test_userfs['arpu_202005'].astype(np.float32)
xgb_proba = xgbclf.predict_proba(test_userfs)
rslt_prob_xgb = pd.DataFrame(lgb_proba)
rslt_prob_xgb.columns = ['lb0', 'lb1']

训练集得分: 0.980019652800524


In [25]:
"""模型融合"""
"""调整概率输出"""
bestnew112 = 0.3 * rslt_prob_lgb + 0.2 * rslt_prob_xgb + 0.5 * rslt_prob_cat

bestnew112["label"] = bestnew112["lb1"]
bestnew112["label"][bestnew112.label > 60 / 100] = 1
bestnew112["label"][bestnew112.label < 60 / 100] = 0

sub['label'] = bestnew112['label']

print(sub['label'].value_counts())
print(sub['label'].value_counts() / sub.shape[0])

sub.to_csv('new.csv', index=None)

0.0    1570
1.0     475
Name: label, dtype: int64
0.0    0.767726
1.0    0.232274
Name: label, dtype: float64


In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 100)

In [2]:
column={'city_name':str, 'county_name': str}

dd_user=pd.read_csv('./0527/train/train_user.csv',dtype=column)
dd_app=pd.read_csv('./0527/train/train_app.csv',dtype=column)
dd_sms=pd.read_csv('./0527/train/train_user.csv',dtype=column)
dd_voc=pd.read_csv('./0527/train/train_voc.csv',dtype=column)

In [3]:
print(f'user:{dd_user.columns}')
print(f'app:{dd_app.columns}')
print(f'sms:{dd_sms.columns}')
print(f'voc:{dd_voc.columns}')

user:Index(['smsphone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'arpu_201908', 'arpu_201909',
       'arpu_201910', 'arpu_201911', 'arpu_201912', 'arpu_202001', 'arpu_202002', 'arpu_202003',
       'label'],
      dtype='object')
app:Index(['phone_no_m', 'busi_name', 'flow', 'month_id'], dtype='object')
sms:Index(['smsphone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'arpu_201908', 'arpu_201909',
       'arpu_201910', 'arpu_201911', 'arpu_201912', 'arpu_202001', 'arpu_202002', 'arpu_202003',
       'label'],
      dtype='object')
voc:Index(['phone_no_m', 'opposite_no_m', 'calltype_id', 'start_datetime', 'call_dur', 'city_name',
       'county_name', 'imei_m'],
      dtype='object')


In [2]:
columns={'city_name':str, 'county_name': str}

dd_user_t=pd.read_csv('./0527/test/test_user.csv',dtype=columns)
dd_app_t=pd.read_csv('./0527/test/test_app.csv',dtype=columns)
dd_sms_t=pd.read_csv('./0527/test/test_user.csv',dtype=columns)
dd_voc_t=pd.read_csv('./0527/test/test_voc.csv',dtype=columns)

In [3]:
print(f'user:{dd_user_t.columns}')
print(f'app:{dd_app_t.columns}')
print(f'sms:{dd_sms_t.columns}')
print(f'voc:{dd_voc_t.columns}')

user:Index(['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'arpu_202004'], dtype='object')
app:Index(['phone_no_m', 'busi_name', 'flow', 'month_id'], dtype='object')
sms:Index(['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'arpu_202004'], dtype='object')
voc:Index(['phone_no_m', 'opposite_no_m', 'calltype_id', 'start_datetime', 'call_dur', 'city_name',
       'county_name', 'imei_m'],
      dtype='object')
