## 函数定义

In [2]:
from pathlib import Path
import pandas as pd
from datetime import datetime
import os
import numpy as np
import phonenumbers
from datetime import datetime, timedelta
import re
import time
import json
import traceback
from tqdm import tqdm


In [3]:
raw_dir = Path('/home/longxiaolei/raw_mx/newcust')
work_dir = Path('/home/mayongzhi/marshall/sender_word_call_app')

In [17]:
class SmsConfigConstant:
    """
    短信配置常量
    """
    INTERVAL_LIST = [1, 3, 7, 15, 30, 60, 90, 180, 360, 'all']
    TIME_PERIODS = [(0, 5, 'early_morning'), (6, 10, 'morning'), (11, 13, 'noon'), (14, 17, 'afternoon'),
                    (18, 23, 'night')]
    # WEEK_TYPES = ['weekday', 'weekend']


class SenderOverdueRateV1:
    """
    短信中sender的逾期率特征
    """


    @staticmethod
    def extract_sender_rlevel_cnt_features(df, time_col, config_all, apply_time,config_level_bins=10):
        """
        提取 周期内 sender 数量类特征，包括数量和占比
        Args:
            df: input dataframe   经过process_sms_data处理后的数据，columns=['body', 'phone', 'read', 'src_phone', 'time', 'type', 'time_day', 'hour', 'weekday', 'month', 'words', 'sender'])
            time_col: time column
            config_all: config data 配置文件，包含sender、level_freq、level_dist
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins+1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_max'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_min'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_std'] = -999
                else:
                    senders = set(config_all[config_all[f'level_{level_type}'] == risk_level]['sender'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        # 去重后的sender数量
                        sender_count_nodup = len(set(time_data['sender']) & senders)
                        # sender数量、占比
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = sender_count_nodup
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = (
                            -99 if len(time_data) == 0 else sender_count_nodup / len(time_data)
                        )
                        filter_sender = time_data[time_data['sender'].apply(lambda x: x in senders)]
                        if filter_sender.empty:
                            for stat in ['max', 'min', 'mean', 'std']:
                                feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_{stat}'] = 0
                        else:
                            sender_count = filter_sender.groupby('sender').size()
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_max'] = sender_count.max()
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_min'] = sender_count.min()
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = sender_count.mean()
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_std'] = sender_count.std()
            return feature_dict

    @staticmethod
    def extract_sender_rlevel_time_features(df, time_col, config_all, apply_time,config_level_bins=10):
        """

        提取各风险等级 sender距离apply_time的最大最小时间差
        Args:
            df: input dataframe  经过process_sms_data处理后的数据，columns=['body', 'phone', 'read', 'src_phone', 'time', 'type', 'time_day', 'hour', 'weekday', 'month', 'words', 'sender'])
            time_col: time column
            config_all: config data 配置文件，包含sender、level_freq、level_dist
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins+1):
                if risk_level not in risk_levels:
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_max'] = -999
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_min'] = -999
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_mean'] = -999
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_std'] = -999
                else:
                    senders = set(config_all[config_all[f'level_{level_type}'] == risk_level]['sender'])
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_max'] = (apply_time - df[df['sender'].apply(lambda x: x in senders)][time_col]).dropna().max().total_seconds() / 3600
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_min'] = (apply_time - df[df['sender'].apply(lambda x: x in senders)][time_col]).dropna().min().total_seconds() / 3600
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_mean'] = (apply_time - df[df['sender'].apply(lambda x: x in senders)][time_col]).dropna().mean().total_seconds() / 3600
                    feature_dict[f'sender_{level_type}_rlevel{risk_level}_time_diff_std'] = (apply_time - df[df['sender'].apply(lambda x: x in senders)][time_col]).dropna().std().total_seconds() / 3600
        return feature_dict

    @staticmethod
    def extract_sender_rlevel_continuous_day_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取各风险等级 sender连续出现天数的最大值、最小值、平均值、方差
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins+1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = -999
                else:
                    senders = set(config_all[config_all[f'level_{level_type}'] == risk_level]['sender'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        time_data = time_data.sort_values(by='time_day', ascending=True)
                        time_data.reset_index(drop=True, inplace=True)
                        filtered_data = time_data[time_data['sender'].apply(lambda x: x in senders)]
                        if filtered_data.empty:
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = 0
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = 0
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = 0
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = 0
                        elif filtered_data['time_day'].nunique() == 1:
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = 1
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = 1
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = 1
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = 0
                        else:
                            time_day_list = filtered_data['time_day'].unique()
                            continuous_day = 1
                            continuous_day_list = []
                            for i in range(1, len(time_day_list)):
                                if (time_day_list[i] - time_day_list[i - 1]).days == 1:
                                    continuous_day += 1
                                else:
                                    continuous_day_list.append(continuous_day)
                                    continuous_day = 1
                            continuous_day_list.append(continuous_day)
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = max(continuous_day_list)
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = min(continuous_day_list)
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = np.mean(continuous_day_list)
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = np.std(continuous_day_list)
        return feature_dict

    @staticmethod
    def extract_sender_rlevel_shift_diff_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取各时间窗口、各风险等级 sender出现频次的环比和差，即相邻时间窗口的 sender出现频次的比值和差值
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数
            

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins+1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                else:
                    senders = set(config_all[config_all[f'level_{level_type}'] == risk_level]['sender'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        time_window_ = pd.Timedelta(days=time_window)
                        time_data = df[(df[time_col] >= apply_time - 2 * time_window_) & (df[time_col] <= apply_time)]
                        now_window_num = len(set(time_data[time_data[time_col] > apply_time - time_window_]['sender']) & senders)
                        last_window_num = len(set(time_data[ time_data[time_col].between(apply_time - 2 * time_window_, apply_time - time_window_)]['sender']) & senders)
                        feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = now_window_num - last_window_num
                        if last_window_num == 0:
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -99
                        else:
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = now_window_num / last_window_num
        return feature_dict

    @staticmethod
    def extract_sender_rlevel_time_period_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取各风险等级 sender出现频次的时间段特征
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins+1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        for time_period in SmsConfigConstant.TIME_PERIODS:
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
                else:
                    senders = set(config_all[config_all[f'level_{level_type}'] == risk_level]['sender'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        for time_period in SmsConfigConstant.TIME_PERIODS:
                            time_data_period = time_data[time_data['hour'].between(time_period[0], time_period[1])]
                            feature_dict[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = len(
                                set(time_data_period['sender']) & senders)
                            if len(time_data_period) == 0:
                                feature_dict[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -99
                            else:
                                feature_dict[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = len(
                                    set(time_data_period['sender']) & senders) / len(time_data_period)
        return feature_dict
    
    @staticmethod
    def _if_df_empty(config_level_bins:int=10):
        """
        如果df为空，所有特征返回-999
        :return: 
        
        """
        
        features = {}
        for level_type in ['freq', 'dist']:
            for risk_level in range(1, config_level_bins+1):
                features[f'sender_{level_type}_rlevel{risk_level}_time_diff_max'] = -999
                features[f'sender_{level_type}_rlevel{risk_level}_time_diff_min'] = -999
                features[f'sender_{level_type}_rlevel{risk_level}_time_diff_mean'] = -999
                features[f'sender_{level_type}_rlevel{risk_level}_time_diff_std'] = -999
                for time_window in SmsConfigConstant.INTERVAL_LIST:
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_max'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_min'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_std'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                    features[f'sender_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                    for time_period in SmsConfigConstant.TIME_PERIODS:
                        features[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                        features[f'sender_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
        return features

class WordOverdueRateV1:
    """
    短信中word的逾期率特征
    """
    @staticmethod
    def extract_word_rlevel_cnt_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取 周期内 word 数量类特征，包括数量和占比
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_max'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_min'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_std'] = -999
                else:
                    words = set(config_all[config_all[f'level_{level_type}'] == risk_level]['word'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        word_count_nodup = sum([len(words & set(x)) for x in time_data['words']])
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = word_count_nodup

                        if sum([len(x) for x in time_data['words']]) == 0:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -99
                        else:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = word_count_nodup / sum(
                                [len(x) for x in time_data['words']])

                        filtered_data = time_data[time_data['words'].apply(lambda x: len(set(x) & words) > 0)]
                        if filtered_data.empty:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_max'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_min'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_std'] = 0
                        else:
                            word_count = time_data[time_data['words'].apply(lambda x: len(set(x) & words) > 0)].explode('words').groupby('words').size()
                            word_count = word_count.sort_values(ascending=False)
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_max'] = word_count.max()
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_min'] = word_count.min()
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = word_count.mean()
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_std'] = word_count.std()
        return feature_dict

    @staticmethod
    def extract_word_rlevel_time_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """

        提取各风险等级 word距离apply_time的最大最小时间差
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_max'] = -999
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_min'] = -999
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_mean'] = -999
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_std'] = -999
                else:
                    words = set(config_all[config_all[f'level_{level_type}'] == risk_level]['word'])
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_max'] = (apply_time - df[df['words'].apply(lambda x: len(set(x) & words) > 0)][time_col]).dropna().max().total_seconds() / 3600
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_min'] = (apply_time - df[df['words'].apply(lambda x: len(set(x) & words) > 0)][time_col]).dropna().min().total_seconds() / 3600
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_mean'] = (apply_time - df[df['words'].apply(lambda x: len(set(x) & words) > 0)][time_col]).dropna().mean().total_seconds() / 3600
                    feature_dict[f'word_{level_type}_rlevel{risk_level}_time_diff_std'] = (apply_time - df[df['words'].apply(lambda x: len(set(x) & words) > 0)][time_col]).dropna().std().total_seconds() / 3600
        return feature_dict

    @staticmethod
    def extract_word_rlevel_continuous_day_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取各风险等级 word连续出现天数的最大值、最小值、平均值、方差
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = -999
                else:
                    words = set(config_all[config_all[f'level_{level_type}'] == risk_level]['word'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        time_data = time_data.sort_values(by='time_day', ascending=True)
                        time_data.reset_index(drop=True, inplace=True)
                        filtered_data = time_data[time_data['words'].apply(lambda x: len(set(x) & words) > 0)]
                        if filtered_data.empty:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = 0
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = 0
                        elif filtered_data['time_day'].nunique() == 1:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = 1
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = 1
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = 1
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = 0
                        else:
                            time_day_list = filtered_data['time_day'].unique()
                            continuous_day = 1
                            continuous_day_list = []
                            for i in range(1, len(time_day_list)):
                                if (time_day_list[i] - time_day_list[i - 1]).days == 1:
                                    continuous_day += 1
                                else:
                                    continuous_day_list.append(continuous_day)
                                    continuous_day = 1
                            continuous_day_list.append(continuous_day)
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = max(
                                continuous_day_list)
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = min(
                                continuous_day_list)
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = np.mean(
                                continuous_day_list)
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = np.std(
                                continuous_day_list)
        return feature_dict

    @staticmethod
    def extract_word_rlevel_shift_diff_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取各时间窗口、各风险等级 word出现频次的环比和差，即相邻时间窗口的 word出现频次的比值和差值
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                else:
                    words = set(config_all[config_all[f'level_{level_type}'] == risk_level]['word'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        time_window_ = pd.Timedelta(days=time_window)
                        time_data = df[(df[time_col] >= apply_time - 2 * time_window_) & (df[time_col] <= apply_time)]
                        now_window_num = sum([len(set(words_) & words) for words_ in
                                             time_data[time_data[time_col] > apply_time - time_window_]['words']])
                        last_window_num = sum([len(set(words_) & words) for words_ in
                                              time_data[ time_data[time_col].between(apply_time - 2 * time_window_, apply_time - time_window_)]['words']])
                        feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = now_window_num - last_window_num
                        if last_window_num == 0:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -99
                        else:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = now_window_num / last_window_num
        return feature_dict

    @staticmethod
    def extract_word_rlevel_time_period_features(df, time_col, config_all, apply_time, config_level_bins=10):
        """
        提取不同time_periods、不同风险等级的word出现频次
        Args:
            df: input dataframe
            time_col: time column
            config_all: config data
            config_level_bins: 风险等级分箱数

        Returns:
            feature dict
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        for time_period in SmsConfigConstant.TIME_PERIODS:
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
                else:
                    words = set(config_all[config_all[f'level_{level_type}'] == risk_level]['word'])
                    for time_window in SmsConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        for time_period in SmsConfigConstant.TIME_PERIODS:
                            time_data_period = time_data[time_data['hour'].between(time_period[0], time_period[1])]
                            feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = \
                                sum([len(set(words_) & words) for words_ in time_data_period['words']])

                            if sum([len(words) for words in time_data_period['words']]) == 0:
                                feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -99
                            else:
                                feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = (
                                        feature_dict[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt']
                                        / sum([len(words) for words in time_data_period['words']])
                                )
        return feature_dict
    
    @staticmethod
    def _is_df_empty(config_level_bins:int=10):
        """
        如果df为空，所有特征返回-999
        :return: 
        """
        features = {}
        for level_type in ['freq', 'dist']:
            for risk_level in range(1, config_level_bins+1):
                features[f'word_{level_type}_rlevel{risk_level}_time_diff_max'] = -999
                features[f'word_{level_type}_rlevel{risk_level}_time_diff_min'] = -999
                features[f'word_{level_type}_rlevel{risk_level}_time_diff_mean'] = -999
                features[f'word_{level_type}_rlevel{risk_level}_time_diff_std'] = -999
                for time_window in SmsConfigConstant.INTERVAL_LIST:
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_max'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_min'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_mean'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_std'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_max'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_min'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_mean'] = -999
                    features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_continuous_std'] = -999
                    if time_window != 'all':
                        features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                        features[f'word_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                    for time_period in SmsConfigConstant.TIME_PERIODS:
                        features[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                        features[f'word_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
        return features

In [18]:
class AppConfigConstant:
    """
    app配置常量
    """
    INTERVAL_LIST = [1, 3, 7, 15, 30, 60, 90, 180, 360, 'all']
    TIME_PERIODS = [(0, 5, 'early_morning'), (6, 10, 'morning'), (11, 13, 'noon'), (14, 17, 'afternoon'),
                    (18, 23, 'night')]
    # WEEK_TYPES = ['weekday', 'weekend']


class AppOverdueRateV1:
    """
    app逾期率特征
    """
    
    @staticmethod
    def extract_app_rlevel_cnt_features(df: pd.DataFrame, time_col: str, config_all: pd.DataFrame,apply_time: str,config_level_bins: int = 10):
        """
        提取app_rlevel_cnt特征
        Args:
            df: 数据
            time_col: 时间列
            config_all: app配置数据 包含appx、level_freq、level_dist
            apply_time: 申请时间
            config_level_bins: 风险等级分箱数
        Returns:
            app_rlevel_cnt特征
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                else:
                    apps = set(config_all[config_all[f'level_{level_type}'] == risk_level]['app'])
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                    
                        app_count = len(set(time_data[time_data['app_package'].isin(apps)]['app_package']))
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = app_count
                        if len(time_data) == 0:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -99
                        else:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = app_count / len(time_data)
        return feature_dict

    @staticmethod
    def extract_app_rlevel_time_features( df: pd.DataFrame,time_col: str,config_all: pd.DataFrame,apply_time: str, config_level_bins: int = 10):
        """
        提取app_rlevel_time特征
        Args:
            df: 数据
            time_col: 时间列, fi_time / lu_time
            config_all: app配置数据
            apply_time: 申请时间
            config_level_bins: 风险等级分箱数
        Returns:
            app_rlevel_time特征
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk in range(1, config_level_bins + 1):
                if risk not in risk_levels:
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_max'] = -999
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_min'] = -999
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_mean'] = -999
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_std'] = -999
                else:
                    apps = set(config_all[config_all[f'level_{level_type}'] == risk]['app'])
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_max'] = (apply_time - df[df['app_package'].isin(apps)][time_col]).dropna().max().total_seconds() / 3600
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_min'] = (apply_time - df[df['app_package'].isin(apps)][time_col]).dropna().min().total_seconds() / 3600
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_mean'] = (apply_time - df[df['app_package'].isin(apps)][time_col]).dropna().mean().total_seconds() / 3600
                    feature_dict[f'app_{level_type}_rlevel{risk}_time_diff_std'] = (apply_time - df[df['app_package'].isin(apps)][time_col]).dropna().std().total_seconds() / 3600
        return feature_dict

    @staticmethod
    def extract_app_rlevel_continuous_day_features(df: pd.DataFrame,time_col: str,config_all: pd.DataFrame,apply_time: str,config_level_bins: int = 10):
        """
        提取app_rlevel_continunous_day特征
        Args:
            df: 数据
            time_col: 时间列
            config_all: app配置数据
            apply_time: 申请时间
            config_level_bins: 风险等级分箱数
        Returns:
            app_rlevel_continunous_day特征
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_max'] = -999
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_min'] = -999
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_mean'] = -999
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_std'] = -999
                else:
                    apps = set(config_all[config_all[f'level_{level_type}'] == risk_level]['app'])
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        time_data = time_data.sort_values(by='fi_day', ascending=False)
                        time_data.reset_index(drop=True, inplace=True)
                       
                        time_day_list = time_data[time_data['app_package'].isin(apps)]['fi_day'].unique()
                        if len(time_day_list) == 0:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_max'] = 0
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_min'] = 0
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_mean'] = 0
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_std'] = 0
                        elif len(time_day_list) == 1:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_max'] = 1
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_min'] = 1
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_mean'] = 1
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_std'] = 0
                        else:
                            continunous_day = 1
                            continunous_day_list = []
                            for i in range(1, len(time_day_list)):
                                if (time_day_list[i - 1] - time_day_list[i]).days == 1:
                                    continunous_day += 1
                                else:
                                    continunous_day_list.append(continunous_day)
                                    continunous_day = 1
                            continunous_day_list.append(continunous_day)
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_max'] = max(continunous_day_list)
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_min'] = min(continunous_day_list)
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_mean'] = np.mean(continunous_day_list)
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_std'] = np.std(continunous_day_list)
        return feature_dict

    @staticmethod
    def extract_app_rlevel_shift_diff_features(df: pd.DataFrame,time_col: str,config_all: pd.DataFrame,apply_time: str,config_level_bins: int = 10):
        """
        提取app_rlevel_shift_diff特征
        Args:
            df: 数据
            time_col: 时间列
            config_all: app配置数据
            apply_time: 申请时间
        Returns:
            app_rlevel_shift_diff特征
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                else:
                    apps = set(config_all[config_all[f'level_{level_type}'] == risk_level]['app'])
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            continue
                        time_window_ = pd.Timedelta(days=time_window)
                        time_data = df[(df[time_col] >= apply_time - 2 * time_window_) & (df[time_col] <= apply_time)]
                        now_window_num = len(set(time_data[time_data[time_col] > apply_time - time_window_]['app_package']) & apps)
                        last_window_num = len(set(time_data[time_data[time_col].between(apply_time - 2 * time_window_, apply_time - time_window_)]['app_package']) & apps)
                        feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = now_window_num - last_window_num
                        if last_window_num == 0:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -99
                        else:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = (now_window_num - last_window_num) / last_window_num
            feature_dict = dict(sorted(feature_dict.items(), key=lambda x: x[0]))
        return feature_dict

    @staticmethod
    def extract_app_rlevel_time_period_features(df: pd.DataFrame,time_col: str,config_all: pd.DataFrame,apply_time: str,config_level_bins: int = 10):
        """
        提取app_rlevel_time_period特征
        Args:
            df: 数据
            time_col: 时间列
            config_all: app配置数据
            apply_time: 申请时间
            config_level_bins: 风险等级分箱数
        Returns:
            app_rlevel_time_period特征
        """
        feature_dict = {}
        apply_time = pd.to_datetime(apply_time)
        for level_type in ['freq', 'dist']:
            risk_levels = config_all[f'level_{level_type}'].unique().tolist()
            for risk_level in range(1, config_level_bins + 1):
                if risk_level not in risk_levels:
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        for time_period in AppConfigConstant.TIME_PERIODS:
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
                else:
                    apps = set(config_all[config_all[f'level_{level_type}'] == risk_level]['app'])
                    for time_window in AppConfigConstant.INTERVAL_LIST:
                        if time_window == 'all':
                            time_data = df[df[time_col] <= apply_time]
                        else:
                            time_window_ = pd.Timedelta(days=time_window)
                            time_data = df[(df[time_col] >= apply_time - time_window_) & (df[time_col] <= apply_time)]
                        for time_period in AppConfigConstant.TIME_PERIODS:
                            time_data_period = time_data[(time_data['fi_hour'].between(time_period[0], time_period[1]))]
                            feature_dict[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = len(
                                set(time_data_period['app_package']) & apps)
                            if len(time_data_period) == 0:
                                feature_dict[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -99
                            else:
                                feature_dict[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = len(
                                    set(time_data_period['app_package']) & apps) / len(time_data_period)
        return feature_dict
    
    @staticmethod
    def _if_df_empty(config_level_bins:int=10):
        """
        如果df为空，所有特征返回-999
        :param config_level_bins: 风险等级分箱数
        :return: 
        """
        features = {}
        for level_type in ['freq', 'dist']:
            for risk_level in range(1, config_level_bins+1):
                # rlevel_time
                features[f'app_{level_type}_rlevel{risk_level}_time_diff_max'] = -999
                features[f'app_{level_type}_rlevel{risk_level}_time_diff_min'] = -999
                features[f'app_{level_type}_rlevel{risk_level}_time_diff_mean'] = -999
                features[f'app_{level_type}_rlevel{risk_level}_time_diff_std'] = -999
                for time_window in AppConfigConstant.INTERVAL_LIST:
                    # rlevel_cnt
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_cnt'] = -999
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_ratio'] = -999
                    # rlevel_continuous_day
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_max'] = -999
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_min'] = -999
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_mean'] = -999
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_continunous_std'] = -999
                    # rlevel_shift_diff
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_diff'] = -999
                    features[f'app_{level_type}_rlevel{risk_level}_d{time_window}_shift_ratio'] = -999
                    for time_period in AppConfigConstant.TIME_PERIODS:
                        # rlevel_time_period
                        features[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_cnt'] = -999
                        features[f'app_{level_type}_rlevel{risk_level}_{time_period[2]}_d{time_window}_ratio'] = -999
            return features
        

In [6]:
class GenericConfigConstant:
    """
    generic config file
    """
    EPS = 1e-30
    COUNTRY_ID = ["mx", "cl", "pe", "co", "ec", "th", "ph", "in", "id", "ng", "tz"]
    COUNTRY_CODE = {"mx": "+52",
                    "cl": "+56",
                    "pe": "+51",
                    "co": "+57",
                    "ec": "+593",
                    "th": "+66",
                    "ph": "+63",
                    "in": "+91",
                    "id": "+62",
                    "ng": "+234",
                    "tz": "+255"}
    COUNTRY_PHONE_LEN = {"mx": 10,
                         "cl": 10,
                         "pe": 10,
                         "co": 10,
                         "ec": 10,
                         "th": 10,
                         "ph": 10,
                         "in": 10,
                         "id": 10,
                         "ng": 10,
                         "tz": 10}

    COUNTRY_TIME_ZONE = {"mx": -6,
                         "cl": -4,
                         "pe": -5,
                         "co": -5,
                         "ec": -5,
                         "th": 7,
                         "ph": 8,
                         "id": 7,
                         "in": 5.5,
                         "ng": 1,
                         "tz": 3}

In [7]:
def time_trans(time, country_id):
    """
    时间转换
    :param time:  时间
    :param country_id: 国家id，如'mx'
    :return:  格式化后的时间
    """
    tz = GenericConfigConstant.COUNTRY_TIME_ZONE[country_id.lower()]
    time_len = len(str(time))
    time_format = "%a %b %d %H:%M:%S GMT%z %Y"
    if time_len == 10:
        format_time = datetime.utcfromtimestamp(int(time)) + timedelta(hours=tz)
    elif time_len == 13:
        format_time = datetime.utcfromtimestamp(int(time) // 1000) + timedelta(hours=tz)
    elif time_len == 34:
        format_time = datetime.strptime(str(time), time_format)
    else:
        format_time = datetime(2099, 12, 31, 23, 59, 59)
    return format_time


def process_app_data(app_data, apply_time,country_id):
    """
    处理传入的app数据，list of json
    Args:
        app_data:app数据
        apply_time:申请时间
        country_id:国家id
    """

    assert country_id in GenericConfigConstant.COUNTRY_ID, "country id not in list, Please input correct country id"
    if app_data is None or app_data.shape[0] == 0:
        return pd.DataFrame(columns=['app_name', 'fi_time', 'lu_time', 'fi_day', 'fi_hour', 'fi_weekday', 'fi_month',
                                     'lu_day', 'lu_hour', 'lu_weekday', 'lu_month'])

    # app_data = json.loads(app_data)
    # app_df = pd.DataFrame(app_data)
    # 将时间戳转成country_id对应的时间
    app_data['fi_time'] = pd.to_datetime(app_data['fi_time'], unit='ms') + pd.Timedelta(hours=GenericConfigConstant.COUNTRY_TIME_ZONE[country_id])
    app_data['lu_time'] = pd.to_datetime(app_data['lu_time'], unit='ms') + pd.Timedelta(hours=GenericConfigConstant.COUNTRY_TIME_ZONE[country_id])
    app_data = app_data[app_data['fi_time'] < apply_time]

    # 每个人选取最近的3000条短信
    app_data = app_data.sort_values(by='fi_time', ascending=False).head(3000)

    # 时间相关处理
    app_data['fi_day'] = app_data['fi_time'].dt.date
    app_data['fi_hour'] = app_data['fi_time'].dt.hour
    app_data['fi_weekday'] = app_data['fi_time'].dt.weekday
    app_data['fi_month'] = app_data['fi_time'].dt.month

    app_data['lu_day'] = app_data['lu_time'].dt.date
    app_data['lu_hour'] = app_data['lu_time'].dt.hour
    app_data['lu_weekday'] = app_data['lu_time'].dt.weekday
    app_data['lu_month'] = app_data['lu_time'].dt.month

    app_data['app_name'] = app_data['app_name'].apply(lambda x: x.lower())
    return app_data

In [8]:
class SmsUtils:
    STOP_WORDS = {
        'mx': ['han', 'al', 'contra', 'esto', 'tengas', 'esta', 'eres', 'tú', 'nuestra', 'estuvieron', 'sois', 'con',
               'tuvo', 'estuvieses', 'esté', 'fuésemos', 'tendrán', 'sentida', 'también', 'ese', 'estad', 'nosotras',
               'tenga', 'vuestra', 'tuyo', 'e', 'nosotros', 'sería', 'porque', 'tuviese', 'estamos', 'seamos',
               'nuestro', 'sin', 'tenían', 'eran', 'poco', 'será', 'vuestros', 'hubiera', 'suya', 'desde', 'nos',
               'quien', 'su', 'tenidas', 'serás', 'estuvimos', 'les', 'estoy', 'suyos', 'estuvierais', 'sentid',
               'estuvieran', 'qué', 'habiendo', 'muchos', 'por', 'habida', 'tenida', 'fui', 'nuestras', 'mío',
               'estaban', 'tenía', 'fuese', 'sean', 'éramos', 'otro', 'algunas', 'habrán', 'tenemos', 'tuvisteis',
               'habríamos', 'hubo', 'nada', 'tendríamos', 'hubieran', 'teníais', 'son', 'esos', 'estuve', 'tenidos',
               'ellos', 'o', 'habidos', 'seríais', 'seré', 'estés', 'estaría', 'habréis', 'somos', 'esa', 'tenías',
               'teníamos', 'otros', 'hube', 'pero', 'estuvieseis', 'estada', 'tenéis', 'soy', 'tuviésemos', 'fueras',
               'estar', 'tengamos', 'este', 'mí', 'estuviese', 'míos', 'tuviera', 'una', 'entre', 'tiene', 'mi',
               'hubierais', 'habíais', 'estarás', 'ti', 'estaríamos', 'un', 'hubiéramos', 'él', 'más', 'seríamos',
               'tendría', 'le', 'tuviéramos', 'tuviesen', 'estaré', 'ante', 'sintiendo', 'las', 'mucho', 'se', 'ni',
               'hay', 'fuesen', 'habíamos', 'estuvo', 'estabas', 'es', 'suyo', 'estaremos', 'estuviste', 'tuvieseis',
               'habrías', 'estarías', 'serán', 'hubiesen', 'tuviste', 'el', 'habré', 'estuviesen', 'tus', 'tuvieron',
               'estáis', 'muy', 'estará', 'habrá', 'habías', 'seáis', 'hayáis', 'fuéramos', 'hayamos', 'todos', 'algo',
               'otras', 'habría', 'hubieses', 'hemos', 'habríais', 'para', 'otra', 'tanto', 'en', 'había', 'habrían',
               'quienes', 'a', 'estadas', 'los', 'tendrías', 'como', 'te', 'estuvisteis', 'tendrás', 'hubieron', 'haya',
               'sobre', 'estén', 'durante', 'serías', 'está', 'vuestro', 'ella', 'teniendo', 'fueseis', 'todo', 'yo',
               'tendremos', 'estéis', 'hubiste', 'estado', 'hubieras', 'tenido', 'del', 'tuyos', 'estuviésemos',
               'habido', 'tuvieran', 'era', 'tuyas', 'de', 'hayas', 'la', 'están', 'mis', 'estarán', 'estaréis',
               'habidas', 'suyas', 'tengáis', 'y', 'vuestras', 'vosotras', 'uno', 'estaba', 'erais', 'fuimos', 'siente',
               'serían', 'no', 'hubisteis', 'fue', 'ya', 'esas', 'estaríais', 'tendréis', 'cual', 'hubieseis', 'habéis',
               'antes', 'eras', 'estados', 'tienen', 'tuya', 'estas', 'ellas', 'seas', 'seremos', 'tuve', 'sentidas',
               'eso', 'tendrían', 'algunos', 'sentidos', 'estás', 'hayan', 'tengo', 'sentido', 'fueran', 'hubiese',
               'fuerais', 'estuviéramos', 'fuera', 'tened', 'donde', 'lo', 'tienes', 'hubiésemos', 'sea', 'estábamos',
               'habrás', 'estarían', 'he', 'fuisteis', 'nuestros', 'has', 'tendríais', 'tendrá', 'fueron', 'hubimos',
               'habían', 'sí', 'ha', 'tuvieras', 'vosotros', 'tuvierais', 'estemos', 'tu', 'os', 'mías', 'que',
               'estuvieras', 'estando', 'seréis', 'estuviera', 'sus', 'fuiste', 'fueses', 'tendré', 'tuvimos',
               'estabais', 'mía', 'tuvieses', 'unos', 'habremos', 'estos', 'tengan', 'me', 'hasta', 'cuando']

    }


def normalize_phone(phone, country_id):
    # 去掉phone中的非数字和非字母
    def phone_normalize(phone):
        if len(phone) == 0 or phone is None:
            return 0, '', ''
        if phone[0] == '+':
            phone = '+' + phone.replace('+', '').lstrip('0')
            length = len(phone) - 1
        else:
            phone = phone.lstrip('0')
            length = len(phone)
        is_vaild, country_code, national_phone = 0, '', phone.lstrip('+')

        if length < 7:
            return str(national_phone)

        try:
            if phone.startswith('+'):
                parse_info = phonenumbers.parse(phone)
            else:
                parse_info = phonenumbers.parse(phone, country_id.upper())

            # print(phonenumbers.is_valid_number(parse_info))
            is_vaild = 1 if phonenumbers.is_valid_number(parse_info) else 0
            if is_vaild:
                # country_code = parse_info.country_code
                national_phone = parse_info.national_number
                return str(national_phone)
        except Exception:
            pass
        return str(national_phone)

    phone = str(phone)
    try:
        if len(phone) == 0 or phone is None:
            return ''
        if phone[0] == '+':
            phone = '+' + ''.join(filter(str.isalnum, phone[1:])).lower()
        else:
            phone = ''.join(filter(str.isalnum, phone)).lower()  # 去掉phone中的非数字和非字母
    except Exception as e:
        pass
    if any(c.isalpha() for c in phone):
        return phone.replace('+', '')
    return phone_normalize(phone)

def process_sms_data(user_sms: pd.DataFrame,
                     apply_time: pd.Timestamp,
                     country_id: str):
    """
    处理传入的sms数据，list of json
    Args:
        user_sms:用户短信数据
        apply_time:申请时间
        country_id:国家id
    """

    assert country_id in GenericConfigConstant.COUNTRY_ID, "country id not in list, Please input correct country id"
    if user_sms is None or user_sms.shape[0] == 0:
        return pd.DataFrame(columns=['body', 'phone', 'read', 'src_phone', 'time', 'type', 'time_day', 'hour', 'weekday', 'month', 'words', 'sender'])
        

    user_sms = user_sms[user_sms['time'].apply(lambda x: len(str(x)) in [10, 13, 34])]
    user_sms.loc[:, 'time'] = user_sms['time'].apply(lambda x: time_trans(x, country_id))
    if isinstance(apply_time, str):
        apply_time = pd.Timestamp(apply_time)
    user_sms = user_sms[user_sms['time'] < apply_time]

    # 每个人选取最近的3000条短信
    user_sms = user_sms.sort_values(by='time', ascending=False).head(3000)
    user_sms['time'] = pd.to_datetime(user_sms['time'], errors='coerce')
    user_sms.dropna(subset=['time'], inplace=True) 

    user_sms['time_day'] = user_sms['time'].dt.date
    user_sms['hour'] = user_sms['time'].dt.hour
    user_sms['weekday'] = user_sms['time'].dt.weekday
    user_sms['month'] = user_sms['time'].dt.month

    # 对'body'字段进行预处理
    user_sms['body'] = user_sms['body'].apply(lambda x: x.lower())
    user_sms['words'] = user_sms['body'].apply(lambda x: set(re.compile(r'\b[Ññáéíóúü¡A-Za-z]+\b').findall(x)))
    user_sms['words'] = user_sms['words'].apply(
        lambda x: [i for i in x if i not in SmsUtils.STOP_WORDS[country_id]])
    user_sms['words'] = user_sms['words'].apply(lambda x: [i for i in x if len(i) > 2])
    user_sms['words'] = user_sms['words'].apply(lambda x: list(map(str, x)))
    user_sms['sender'] = user_sms['phone'].apply(lambda x: normalize_phone(x, country_id))
    return user_sms


In [9]:
def parse_json_data(df, json_column, id_column, retain_column=None):
    """
    将df中的json字符串的摊平，并根据想要保留的字段，保留相关的值
    :param df:  待摊平处理的dataframe
    :param json_column: json字符串所在的那一列
    :param id_column: 唯一关联主键，后续用于定位转化失败的数据，或外部唯一关联的id
    :param retain_column: 待保留的列，可以是字符串，也可以是数组保存多列
    :return:
    """
    new_rows = []
    for i, row in df.iterrows():
        json_data_str = row[json_column]
        id_column_value = row[id_column]
        if isinstance(json_data_str, str):
            try:
                json_data = json.loads(json_data_str)
                for item in json_data:
                    new_row = item
                    new_row[id_column] = id_column_value
                    if isinstance(retain_column, str) and len(retain_column) > 0:
                        new_row[retain_column] = row[retain_column]
                    elif isinstance(retain_column, list) and len(retain_column) > 0:
                        for column in retain_column:
                            new_row[column] = row[column]
                    new_rows.append(new_row)
            except json.JSONDecodeError:
                print(f"Failed to parse JSON data for {id_column}:{id_column_value}")
                continue
    return pd.DataFrame(new_rows)

In [10]:
def data_of_dir(dir_path: str, contains_flags="", start_date='2023-01-01', end_date='2999-01-01'):
    """
    获取指定目录下的文件
    :param dir_path:  目录路径
    :param contains_flags: 包含的标志
    :param start_date:  开始日期
    :param end_date:  结束日期
    :return: 
    """
    def _fetch_filenams(dir_path: str, contain_flag, start_date=None, end_date=None):
        file_paths = []
        contain_flag = contain_flag or ""
        pattern = r"\d{4}-\d{2}-\d{2}"
        for file_name in os.listdir(dir_path):
            if (contain_flag in file_name) and (
            file_name.endswith(('.pqt', '.parquet', '.csv', '.xlsx', '.pickle', '.pkl'))):
                if start_date is None:
                    file_paths.append(os.path.join(dir_path, file_name))
                else:
                    match = re.search(pattern, file_name)
                    date = match.group()  # type: ignore
                    if (date >= start_date) and (date < end_date):
                        file_paths.append(os.path.join(dir_path, file_name))
        file_paths.sort()
        return file_paths

    if isinstance(contains_flags, str) or contains_flags is None:
        return _fetch_filenams(dir_path, contains_flags, start_date, end_date)
    elif isinstance(contains_flags, list):
        file_names = None
        for contains_flag in contains_flags:  # type: ignore
            if file_names is None:
                file_names = _fetch_filenams(dir_path, contains_flag, start_date, end_date)
            else:
                file_names = file_names + _fetch_filenams(dir_path, contains_flag, start_date, end_date)
        return file_names

In [11]:
def parallel_process(task_function,task_list,process_num=10):
    """
    task_function : 任务函数
    task_list : 任务列表
    """
    start_time = time.time()
    def _task(task_function,queue,task_params):
        try:
            task_function(*task_params)
        finally:
            queue.put(True)
    from multiprocessing import Queue, Process
    mp_queue = Queue()
    for i in range(process_num):
        mp_queue.put(True)
    for task_params in tqdm(task_list):
        mp_queue.get()
        Process(target=_task,args=(task_function,mp_queue,task_params)).start() # mq_queue 用于控制进程数
    for i in range(process_num):
        mp_queue.get()
    mp_queue.close()
    gc.collect()
    print(f'任务完成，总计{len(task_list)}个任务,耗时{time.time()-start_time}s')

## sender 计算函数

In [12]:
def calc_sender_features(row, config_after_window):
    """
    计算短信发送者相关特征 按行
    :param row:  行数据
    :param config_after_window:  配置数据
    :return: 
    """
    sms_data = row['sms_data']
    apply_time = pd.to_datetime(row['apply_time'])
    country_id = 'mx'
    feature_dict = {}
    if sms_data is None or sms_data == '' or sms_data == '[]' or sms_data == 'null':
        return pd.concat([row, pd.Series(SenderOverdueRateV1._if_df_empty())], axis=0)
    sms_data = json.loads(sms_data)
    user_sms = pd.DataFrame(sms_data)
    try:
        user_sms = process_sms_data(user_sms, apply_time, country_id)
        if user_sms.shape[0] == 0:
            feature_dict = SenderOverdueRateV1._if_df_empty()
        else:
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_cnt_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_time_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_continuous_day_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_shift_diff_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_time_period_features(user_sms, 'time', config_after_window, apply_time))
        row = pd.concat([row, pd.Series(feature_dict)], axis=0)
    except Exception as e:
        print(f"Error in calc_sender_features: {traceback.format_exc()}")
        print('app_order_id:', row['app_order_id'])
    return row 
def computer_sender_features_daily(file_path:str, save_path:str):
    """
    计算短信发送者相关特征
    :param file_path:  格式：raw_AM_2024-05-05.pqt
    :param save_path: 
    :return: 
    """
    try:
        raw_df = pd.read_parquet(file_path)
        month = file_path.split('_')[-1].split('.')[0][:7]
        config_path = work_dir + 'sender_config_window' + f'sender_overdue_rate_window3_{month}.parquet'
        config_after_window = pd.read_parquet(config_path)
        raw_df = raw_df.apply(lambda x: calc_sender_features(x, config_after_window), axis=1)
    except Exception as e:
        print(f"Error in computer_sender_features_daily: {traceback.format_exc()}")
    raw_df.to_parquet(save_path)

## word 计算函数

In [12]:
def calc_word_features(row, config_after_window):
    """
    计算word相关特征 按行
    :param row:  行数据
    :param config_after_window:  配置数据
    :return: 
    """
    sms_data = row['sms_data']
    apply_time = pd.to_datetime(row['apply_time'])
    country_id = 'mx'
    feature_dict = {}
    if sms_data is None or sms_data == '' or sms_data == '[]' or sms_data == 'null':
        return pd.concat([row, pd.Series(WordOverdueRateV1._is_df_empty())], axis=0)
    sms_data = json.loads(sms_data)
    user_sms = pd.DataFrame(sms_data)
    try:
        user_sms = process_sms_data(user_sms, apply_time, country_id)
        if user_sms.shape[0] == 0:
            feature_dict = WordOverdueRateV1._is_df_empty()
        else:
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_cnt_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_time_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_continuous_day_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_shift_diff_features(user_sms, 'time', config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_time_period_features(user_sms, 'time', config_after_window, apply_time))
        row = pd.concat([row, pd.Series(feature_dict)], axis=0)
    except Exception as e:
        print(f"Error in calc_word_features: {traceback.format_exc()}")
        print('app_order_id:', row['app_order_id'])
    return row

def computer_word_features_daily(file_path:str, save_path:str):
    """
    计算word相关特征
    :param file_path:  格式：raw_AM_2024-05-05.pqt
    :param save_path: 
    :return: 
    """
    try:
        raw_df = pd.read_parquet(file_path)
        month = file_path.split('_')[-1].split('.')[0][:7]
        config_path = work_dir + 'word_config_window' + f'word_overdue_rate_window3_{month}.parquet'
        config_after_window = pd.read_parquet(config_path)
        raw_df = raw_df.apply(lambda x: calc_word_features(x, config_after_window), axis=1)
    except Exception as e:
        print(f"Error in computer_word_features_daily: {traceback.format_exc()}")
    raw_df.to_parquet(save_path)

In [13]:
def calc_sms_features(row,sender_config_after_window,word_config_after_window):
    """
    计算短信相关特征 按行
    :param row:  行数据
    :param sender_config_after_window:  短信发送者配置数据
    :param word_config_after_window:  短信内容配置数据
    :return: 
    """
    sms_data = row['sms_data']
    apply_time = pd.to_datetime(row['apply_time'])
    country_id = 'mx'
    feature_dict = {}
    if sms_data is None or sms_data == '' or sms_data == '[]' or sms_data == 'null':
        return pd.concat([row, pd.Series(SenderOverdueRateV1._if_df_empty()), pd.Series(WordOverdueRateV1._is_df_empty())], axis=0)
    sms_data = json.loads(sms_data)
    user_sms = pd.DataFrame(sms_data)
    try:
        user_sms = process_sms_data(user_sms, apply_time, country_id)
        if user_sms.shape[0] == 0:
            feature_dict.update(SenderOverdueRateV1._if_df_empty())
            feature_dict.update(WordOverdueRateV1._is_df_empty())
        else:
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_cnt_features(user_sms, 'time', sender_config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_time_features(user_sms, 'time', sender_config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_continuous_day_features(user_sms, 'time', sender_config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_shift_diff_features(user_sms, 'time', sender_config_after_window, apply_time))
            feature_dict.update(SenderOverdueRateV1.extract_sender_rlevel_time_period_features(user_sms, 'time', sender_config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_cnt_features(user_sms, 'time', word_config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_time_features(user_sms, 'time', word_config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_continuous_day_features(user_sms, 'time', word_config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_shift_diff_features(user_sms, 'time', word_config_after_window, apply_time))
            feature_dict.update(WordOverdueRateV1.extract_word_rlevel_time_period_features(user_sms, 'time', word_config_after_window, apply_time))
        row = pd.concat([row, pd.Series(feature_dict)], axis=0)
    except Exception as e:
        print(f"Error in calc_sms_features: {traceback.format_exc()}")
        print('app_order_id:', row['app_order_id'])
    return row
        

## app 计算函数

In [14]:
def calc_app_features(row, config_after_window):
    """
    计算app相关特征 按行
    :param row:  行数据
    :param config_after_window:  配置数据
    :return: 
    """
    app_data = row['applist_data']
    apply_time = pd.to_datetime(row['apply_time'])
    country_id = 'mx'
    feature_dict = {}
    if app_data is None or app_data == '' or app_data == '[]' or app_data == 'null':
        return pd.concat([row, pd.Series(AppOverdueRateV1._if_df_empty())], axis=0)
    app_data = json.loads(app_data)
    user_app = pd.DataFrame(app_data)
    try:
        user_app = process_app_data(user_app, apply_time, country_id)
        if user_app.shape[0] == 0:
            feature_dict.update(AppOverdueRateV1._if_df_empty(user_app))
        else:
            feature_dict.update(AppOverdueRateV1.extract_app_rlevel_cnt_features(user_app, 'fi_time', config_after_window, apply_time))
            feature_dict.update(AppOverdueRateV1.extract_app_rlevel_time_features(user_app, 'fi_time', config_after_window, apply_time))
            feature_dict.update(AppOverdueRateV1.extract_app_rlevel_continuous_day_features(user_app, 'fi_time', config_after_window, apply_time))
            feature_dict.update(AppOverdueRateV1.extract_app_rlevel_shift_diff_features(user_app, 'fi_time', config_after_window, apply_time))
            feature_dict.update(AppOverdueRateV1.extract_app_rlevel_time_period_features(user_app, 'fi_time', config_after_window, apply_time))
        row = pd.concat([row, pd.Series(feature_dict)], axis=0)
    except Exception as e:
        print(f"Error in calc_app_features: {traceback.format_exc()}")
        print('app_order_id:', row['app_order_id'])
    return row 

def computer_app_features_daily(file_path:str, save_path:str):
    """
    计算app相关特征
    :param file_path: 
    :param save_path: 
    :return: 
    """
    try:
        raw_df = pd.read_parquet(file_path)
        month = file_path.split('_')[-1].split('.')[0][:7]
        config_path = work_dir + 'app_config_window' + f'app_overdue_rate_window3_{month}.parquet'
        config_after_window = pd.read_parquet(config_path)
        raw_df = raw_df.apply(lambda x: calc_app_features(x, config_after_window), axis=1)
    except Exception as e:
        print(f"Error in computer_app_features_daily: {traceback.format_exc()}")
    raw_df.to_parquet(save_path)


In [15]:
def computer_features_daily(file_path:str, save_path:str):
    """
    计算特征
    :param file_path: 
    :param save_path: 
    :return: 
    """
    try:
        # 如果save_path已经存在，查看shape1是否为4，如果是，则计算
        raw_df = pd.read_parquet(file_path,columns=['app_order_id','apply_time', 'sms_data','applist_data'])
        month = str(file_path).split('_')[-1].split('.')[0][:7]
        sender_config_path = work_dir / 'sender_config_window' / f'sender_overdue_rate_window3_{month}.parquet'
        sender_config_after_window = pd.read_parquet(sender_config_path)
        word_config_path = work_dir / 'word_config_window' / f'word_overdue_rate_window3_{month}.parquet'
        word_config_after_window = pd.read_parquet(word_config_path)
        app_config_path = work_dir / 'app_config_window' / f'app_overdue_rate_window3_{month}.parquet'
        app_config_after_window = pd.read_parquet(app_config_path)
        
        raw_df = raw_df.apply(lambda x: calc_sms_features(x, sender_config_after_window, word_config_after_window), axis=1)
        raw_df = raw_df.apply(lambda x: calc_app_features(x, app_config_after_window), axis=1)
    except Exception as e:
        print(file_path)
        print(f"Error in computer_features_daily: {traceback.format_exc()}")
    print(f'{save_path} : {raw_df.shape}')
    raw_df.to_parquet(save_path)

In [None]:
res_dir = '/home/risk_share_dir/mx_newcust_fea_new'
if not os.path.exists(res_dir):
    os.makedirs(res_dir)
file_paths = data_of_dir(raw_dir, '_AM_')
task_list =  [ (x,  str(res_dir + '/' + x.split('/')[-1])) for x in file_paths]
print(task_list[0])
parallel_process(computer_features_daily,task_list,process_num=15)

('/home/longxiaolei/raw_mx/newcust/raw_AM_2023-11-08.pqt', '/home/risk_share_dir/mx_newcust_fea_new/raw_AM_2023-11-08.pqt')


  5%|▍         | 13/286 [00:00<00:02, 118.09it/s]

/home/risk_share_dir/mx_newcust_fea_new/raw_AM_2023-11-08.pqt : (1, 11924)


  5%|▍         | 13/286 [00:14<00:02, 118.09it/s]

In [37]:
df1 = pd.read_parquet('/home/risk_share_dir/mx_newcust_fea/raw_AM_2024-08-04.pqt')
df2 = pd.read_parquet('/home/risk_share_dir/mx_newcust_fea/raw_AM_2024-05-04.pqt')
cols_in_df1_not_in_df2 = df1.columns[~df1.columns.isin(df2.columns)]
cols_in_df2_not_in_df1 = df2.columns[~df2.columns.isin(df1.columns)]  # df2中有，df1中没有的列

In [38]:
cols_in_df1_not_in_df2

Index([], dtype='object')

In [39]:
cols_in_df2_not_in_df1

Index(['app_dist_rlevel9_d1_cnt', 'app_dist_rlevel9_d1_ratio',
       'app_dist_rlevel9_d3_cnt', 'app_dist_rlevel9_d3_ratio',
       'app_dist_rlevel9_d7_cnt', 'app_dist_rlevel9_d7_ratio',
       'app_dist_rlevel9_d15_cnt', 'app_dist_rlevel9_d15_ratio',
       'app_dist_rlevel9_d30_cnt', 'app_dist_rlevel9_d30_ratio',
       ...
       'app_dist_rlevel9_early_morning_dall',
       'app_dist_rlevel9_early_morning_dall_ratio',
       'app_dist_rlevel9_morning_dall', 'app_dist_rlevel9_morning_dall_ratio',
       'app_dist_rlevel9_noon_dall', 'app_dist_rlevel9_noon_dall_ratio',
       'app_dist_rlevel9_afternoon_dall',
       'app_dist_rlevel9_afternoon_dall_ratio', 'app_dist_rlevel9_night_dall',
       'app_dist_rlevel9_night_dall_ratio'],
      dtype='object', length=182)

In [None]:
def check_file(file_path):
    """
    检查文件,如果文件的shape[1]不为11924，则返回文件路径
    :param file_path: 
    :return: 
    """
    check = pd.read_parquet(file_path)
    if check.shape[1] != 11924:
        print(f"Error in check_file: {file_path}")
        

In [None]:
res_dir = '/home/risk_share_dir/mx_newcust_fea'
file_paths = data_of_dir(raw_dir, '_AM_')
task_list =  [str(res_dir + '/' + x.split('/')[-1]) for x in file_paths]
parallel_process(check_file,task_list,process_num=15)

# check result

In [19]:
from mypackage.common_tools import *

In [24]:
files = os.listdir('/home/risk_share_dir/mx_newcust_fea')
files = [os.path.join('/home/risk_share_dir/mx_newcust_fea', x) for x in files]

In [23]:
files[0]

'raw_AM_2023-11-08.pqt'

In [10]:
test = pd.read_parquet('/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-12-08.pqt')


In [11]:
test.shape

(48, 11600)

In [13]:
test['app_order_id'].unique()

array([1182839320705683456, 1182722140638990336, 1182720020758056960,
       1182683912556957696, 1182765473843302400, 1182767557523197952,
       1182816324129574912, 1182790489704157184, 1182697715373887488,
       1182564039243780096, 1182691396554248192, 1182704767471112192,
       1182710913334431744, 1182720169215447040, 1182720542844047360,
       1182723700655837184, 1182730623480418304, 1182734063849885696,
       1182756616899317760, 1182758305584828416, 1182758718354673664,
       1182760327386157056, 1182761793198936064, 1182800533460013056,
       1182778783829512192, 1182779298634190848, 1182787534607273984,
       1182789638545657856, 1182789592345399296, 1182790106223136768,
       1182793421103222784, 1182796936970162176, 1182859001143414784,
       1182807274272772096, 1182815267873804288, 1182817087501291520,
       1182817627723452416, 1182819504175370240, 1182827094716370944,
       1182842445420220416, 1182846149980508160, 1182848468927303680,
       1182856719987

In [25]:
df_res = batch_load_files_to_df(files)
df_res.shape

(66057, 12042)

In [29]:
# test 的columns 不在 df_res中 的
cols_in_test_not_in_df_res = test.columns[~test.columns.isin(df_res.columns)]
cols_in_df_res_not_in_test = df_res.columns[~df_res.columns.isin(test.columns)]
for col in cols_in_test_not_in_df_res:
    print(col)

for col in cols_in_df_res_not_in_test:
    print(col)


sender_dist_rlevel10_dall_shift_diff
sender_dist_rlevel10_dall_shift_ratio
sender_dist_rlevel1_dall_shift_diff
sender_dist_rlevel1_dall_shift_ratio
sender_dist_rlevel2_dall_shift_diff
sender_dist_rlevel2_dall_shift_ratio
sender_dist_rlevel3_dall_shift_diff
sender_dist_rlevel3_dall_shift_ratio
sender_dist_rlevel4_dall_shift_diff
sender_dist_rlevel4_dall_shift_ratio
sender_dist_rlevel5_dall_shift_diff
sender_dist_rlevel5_dall_shift_ratio
sender_dist_rlevel6_dall_shift_diff
sender_dist_rlevel6_dall_shift_ratio
sender_dist_rlevel7_dall_shift_diff
sender_dist_rlevel7_dall_shift_ratio
sender_dist_rlevel9_dall_shift_diff
sender_dist_rlevel9_dall_shift_ratio
sender_freq_rlevel10_dall_shift_diff
sender_freq_rlevel10_dall_shift_ratio
sender_freq_rlevel1_dall_shift_diff
sender_freq_rlevel1_dall_shift_ratio
sender_freq_rlevel2_dall_shift_diff
sender_freq_rlevel2_dall_shift_ratio
sender_freq_rlevel3_dall_shift_diff
sender_freq_rlevel3_dall_shift_ratio
sender_freq_rlevel4_dall_shift_diff
sender_freq

In [5]:
df_res['word_freq_rlevel6_dall_shift_ratio'].unique()

NameError: name 'df_res' is not defined

In [30]:
cols_in_df_res_not_in_test.tolist()

['sender_dist_rlevel10_dall_shift_diff',
 'sender_dist_rlevel10_dall_shift_ratio',
 'sender_dist_rlevel1_dall_shift_diff',
 'sender_dist_rlevel1_dall_shift_ratio',
 'sender_dist_rlevel2_dall_shift_diff',
 'sender_dist_rlevel2_dall_shift_ratio',
 'sender_dist_rlevel3_dall_shift_diff',
 'sender_dist_rlevel3_dall_shift_ratio',
 'sender_dist_rlevel4_dall_shift_diff',
 'sender_dist_rlevel4_dall_shift_ratio',
 'sender_dist_rlevel5_dall_shift_diff',
 'sender_dist_rlevel5_dall_shift_ratio',
 'sender_dist_rlevel6_dall_shift_diff',
 'sender_dist_rlevel6_dall_shift_ratio',
 'sender_dist_rlevel7_dall_shift_diff',
 'sender_dist_rlevel7_dall_shift_ratio',
 'sender_dist_rlevel9_dall_shift_diff',
 'sender_dist_rlevel9_dall_shift_ratio',
 'sender_freq_rlevel10_dall_shift_diff',
 'sender_freq_rlevel10_dall_shift_ratio',
 'sender_freq_rlevel1_dall_shift_diff',
 'sender_freq_rlevel1_dall_shift_ratio',
 'sender_freq_rlevel2_dall_shift_diff',
 'sender_freq_rlevel2_dall_shift_ratio',
 'sender_freq_rlevel3_da

In [34]:
def rm_dall_columns(file_path):
    """
    删除文件的指定列
    :param file_path: 
    :return: 
    """
    cols_to_drop = ['sender_dist_rlevel10_dall_shift_diff',
 'sender_dist_rlevel10_dall_shift_ratio',
 'sender_dist_rlevel1_dall_shift_diff',
 'sender_dist_rlevel1_dall_shift_ratio',
 'sender_dist_rlevel2_dall_shift_diff',
 'sender_dist_rlevel2_dall_shift_ratio',
 'sender_dist_rlevel3_dall_shift_diff',
 'sender_dist_rlevel3_dall_shift_ratio',
 'sender_dist_rlevel4_dall_shift_diff',
 'sender_dist_rlevel4_dall_shift_ratio',
 'sender_dist_rlevel5_dall_shift_diff',
 'sender_dist_rlevel5_dall_shift_ratio',
 'sender_dist_rlevel6_dall_shift_diff',
 'sender_dist_rlevel6_dall_shift_ratio',
 'sender_dist_rlevel7_dall_shift_diff',
 'sender_dist_rlevel7_dall_shift_ratio',
 'sender_dist_rlevel9_dall_shift_diff',
 'sender_dist_rlevel9_dall_shift_ratio',
 'sender_freq_rlevel10_dall_shift_diff',
 'sender_freq_rlevel10_dall_shift_ratio',
 'sender_freq_rlevel1_dall_shift_diff',
 'sender_freq_rlevel1_dall_shift_ratio',
 'sender_freq_rlevel2_dall_shift_diff',
 'sender_freq_rlevel2_dall_shift_ratio',
 'sender_freq_rlevel3_dall_shift_diff',
 'sender_freq_rlevel3_dall_shift_ratio',
 'sender_freq_rlevel4_dall_shift_diff',
 'sender_freq_rlevel4_dall_shift_ratio',
 'sender_freq_rlevel5_dall_shift_diff',
 'sender_freq_rlevel5_dall_shift_ratio',
 'sender_freq_rlevel6_dall_shift_diff',
 'sender_freq_rlevel6_dall_shift_ratio',
 'sender_freq_rlevel7_dall_shift_diff',
 'sender_freq_rlevel7_dall_shift_ratio',
 'sender_freq_rlevel8_dall_shift_diff',
 'sender_freq_rlevel8_dall_shift_ratio',
 'sender_freq_rlevel9_dall_shift_diff',
 'sender_freq_rlevel9_dall_shift_ratio',
 'word_dist_rlevel10_dall_shift_diff',
 'word_dist_rlevel10_dall_shift_ratio',
 'word_dist_rlevel1_dall_shift_diff',
 'word_dist_rlevel1_dall_shift_ratio',
 'word_dist_rlevel2_dall_shift_diff',
 'word_dist_rlevel2_dall_shift_ratio',
 'word_dist_rlevel3_dall_shift_diff',
 'word_dist_rlevel3_dall_shift_ratio',
 'word_dist_rlevel4_dall_shift_diff',
 'word_dist_rlevel4_dall_shift_ratio',
 'word_dist_rlevel5_dall_shift_diff',
 'word_dist_rlevel5_dall_shift_ratio',
 'word_dist_rlevel6_dall_shift_diff',
 'word_dist_rlevel6_dall_shift_ratio',
 'word_dist_rlevel7_dall_shift_diff',
 'word_dist_rlevel7_dall_shift_ratio',
 'word_dist_rlevel8_dall_shift_diff',
 'word_dist_rlevel8_dall_shift_ratio',
 'word_dist_rlevel9_dall_shift_diff',
 'word_dist_rlevel9_dall_shift_ratio',
 'word_freq_rlevel10_dall_shift_diff',
 'word_freq_rlevel10_dall_shift_ratio',
 'word_freq_rlevel1_dall_shift_diff',
 'word_freq_rlevel1_dall_shift_ratio',
 'word_freq_rlevel2_dall_shift_diff',
 'word_freq_rlevel2_dall_shift_ratio',
 'word_freq_rlevel3_dall_shift_diff',
 'word_freq_rlevel3_dall_shift_ratio',
 'word_freq_rlevel4_dall_shift_diff',
 'word_freq_rlevel4_dall_shift_ratio',
 'word_freq_rlevel5_dall_shift_diff',
 'word_freq_rlevel5_dall_shift_ratio',
 'word_freq_rlevel6_dall_shift_diff',
 'word_freq_rlevel6_dall_shift_ratio',
 'word_freq_rlevel7_dall_shift_diff',
 'word_freq_rlevel7_dall_shift_ratio',
 'word_freq_rlevel8_dall_shift_diff',
 'word_freq_rlevel8_dall_shift_ratio',
 'word_freq_rlevel9_dall_shift_diff',
 'word_freq_rlevel9_dall_shift_ratio',
 'app_dist_rlevel10_dall_shift_diff',
 'app_dist_rlevel10_dall_shift_ratio',
 'app_dist_rlevel1_dall_shift_diff',
 'app_dist_rlevel1_dall_shift_ratio',
 'app_dist_rlevel2_dall_shift_diff',
 'app_dist_rlevel2_dall_shift_ratio',
 'app_dist_rlevel3_dall_shift_diff',
 'app_dist_rlevel3_dall_shift_ratio',
 'app_dist_rlevel4_dall_shift_diff',
 'app_dist_rlevel4_dall_shift_ratio',
 'app_dist_rlevel5_dall_shift_diff',
 'app_dist_rlevel5_dall_shift_ratio',
 'app_dist_rlevel6_dall_shift_diff',
 'app_dist_rlevel6_dall_shift_ratio',
 'app_dist_rlevel7_dall_shift_diff',
 'app_dist_rlevel7_dall_shift_ratio',
 'app_dist_rlevel8_dall_shift_diff',
 'app_dist_rlevel8_dall_shift_ratio',
 'app_dist_rlevel9_dall_shift_diff',
 'app_dist_rlevel9_dall_shift_ratio',
 'app_freq_rlevel10_dall_shift_diff',
 'app_freq_rlevel10_dall_shift_ratio',
 'app_freq_rlevel1_dall_shift_diff',
 'app_freq_rlevel1_dall_shift_ratio',
 'app_freq_rlevel2_dall_shift_diff',
 'app_freq_rlevel2_dall_shift_ratio',
 'app_freq_rlevel3_dall_shift_diff',
 'app_freq_rlevel3_dall_shift_ratio',
 'app_freq_rlevel4_dall_shift_diff',
 'app_freq_rlevel4_dall_shift_ratio',
 'app_freq_rlevel5_dall_shift_diff',
 'app_freq_rlevel5_dall_shift_ratio',
 'app_freq_rlevel6_dall_shift_diff',
 'app_freq_rlevel6_dall_shift_ratio',
 'app_freq_rlevel7_dall_shift_diff',
 'app_freq_rlevel7_dall_shift_ratio',
 'app_freq_rlevel8_dall_shift_diff',
 'app_freq_rlevel8_dall_shift_ratio',
 'app_freq_rlevel9_dall_shift_diff',
 'app_freq_rlevel9_dall_shift_ratio']
    df = pd.read_parquet(file_path)
    df.drop(columns=cols_to_drop,inplace=True)
    print(df.shape)
    # df.to_parquet(file_path)

In [31]:
res_dir = '/home/risk_share_dir/mx_newcust_fea'
file_paths = data_of_dir(res_dir, '_AM_')
file_paths

['/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-08.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-09.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-10.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-11.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-12.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-13.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-14.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-15.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-16.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-17.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-18.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-19.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-20.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-21.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-22.pqt',
 '/home/risk_share_dir/mx_newcust_fea/raw_AM_2023-11-23.pqt',
 '/home/

In [15]:
test_drop = pd.read_parquet('/home/risk_share_dir/mx_newcust_fea/raw_AM_2024-08-18.pqt')

In [17]:
test_drop.shape

(452, 11398)

In [18]:
test_drop['app_order_id'].unique().tolist()

[1274784971164667904,
 1274873177373593600,
 1274876937042841600,
 1274796429923504128,
 1274905776783544320,
 1274900982266253312,
 1274884013685133312,
 1274769768809156608,
 1274921730280181760,
 1274786206408167424,
 1274794255738916864,
 1274801761248792576,
 1274885227478962176,
 1274811074675658752,
 1274874554384568320,
 1274905858106904576,
 1274884797856403456,
 1274892772675342336,
 1274692250647318528,
 1274925433087250432,
 1274909200539373568,
 1274916373235421184,
 1274945581600919552,
 1274894771961647104,
 1274872506427559936,
 1274691414609285120,
 1274879494238687232,
 1274859206365073408,
 1274701167938949120,
 1274704919550939136,
 1274632340186255360,
 1274891422004596736,
 1274860496377794560,
 1274830630538366976,
 1274871992377856000,
 1274900812703125504,
 1274625478103584768,
 1274922158921273344,
 1274782548173967360,
 1274644546374258688,
 1274641573237673984,
 1274783078484987904,
 1274783857191079936,
 1274648929950420992,
 1274934152390955008,
 127481766

In [33]:
test_drop.shape

(452, 11398)

In [35]:
rm_dall_columns('/home/risk_share_dir/mx_newcust_fea/raw_AM_2024-08-18.pqt')

KeyError: "['sender_dist_rlevel10_dall_shift_diff', 'sender_dist_rlevel10_dall_shift_ratio', 'sender_dist_rlevel1_dall_shift_diff', 'sender_dist_rlevel1_dall_shift_ratio', 'sender_dist_rlevel2_dall_shift_diff', 'sender_dist_rlevel2_dall_shift_ratio', 'sender_dist_rlevel3_dall_shift_diff', 'sender_dist_rlevel3_dall_shift_ratio', 'sender_dist_rlevel4_dall_shift_diff', 'sender_dist_rlevel4_dall_shift_ratio', 'sender_dist_rlevel5_dall_shift_diff', 'sender_dist_rlevel5_dall_shift_ratio', 'sender_dist_rlevel6_dall_shift_diff', 'sender_dist_rlevel6_dall_shift_ratio', 'sender_dist_rlevel7_dall_shift_diff', 'sender_dist_rlevel7_dall_shift_ratio', 'sender_dist_rlevel9_dall_shift_diff', 'sender_dist_rlevel9_dall_shift_ratio', 'sender_freq_rlevel10_dall_shift_diff', 'sender_freq_rlevel10_dall_shift_ratio', 'sender_freq_rlevel1_dall_shift_diff', 'sender_freq_rlevel1_dall_shift_ratio', 'sender_freq_rlevel2_dall_shift_diff', 'sender_freq_rlevel2_dall_shift_ratio', 'sender_freq_rlevel3_dall_shift_diff', 'sender_freq_rlevel3_dall_shift_ratio', 'sender_freq_rlevel4_dall_shift_diff', 'sender_freq_rlevel4_dall_shift_ratio', 'sender_freq_rlevel5_dall_shift_diff', 'sender_freq_rlevel5_dall_shift_ratio', 'sender_freq_rlevel6_dall_shift_diff', 'sender_freq_rlevel6_dall_shift_ratio', 'sender_freq_rlevel7_dall_shift_diff', 'sender_freq_rlevel7_dall_shift_ratio', 'sender_freq_rlevel8_dall_shift_diff', 'sender_freq_rlevel8_dall_shift_ratio', 'sender_freq_rlevel9_dall_shift_diff', 'sender_freq_rlevel9_dall_shift_ratio', 'word_dist_rlevel10_dall_shift_diff', 'word_dist_rlevel10_dall_shift_ratio', 'word_dist_rlevel1_dall_shift_diff', 'word_dist_rlevel1_dall_shift_ratio', 'word_dist_rlevel2_dall_shift_diff', 'word_dist_rlevel2_dall_shift_ratio', 'word_dist_rlevel3_dall_shift_diff', 'word_dist_rlevel3_dall_shift_ratio', 'word_dist_rlevel4_dall_shift_diff', 'word_dist_rlevel4_dall_shift_ratio', 'word_dist_rlevel5_dall_shift_diff', 'word_dist_rlevel5_dall_shift_ratio', 'word_dist_rlevel6_dall_shift_diff', 'word_dist_rlevel6_dall_shift_ratio', 'word_dist_rlevel7_dall_shift_diff', 'word_dist_rlevel7_dall_shift_ratio', 'word_dist_rlevel8_dall_shift_diff', 'word_dist_rlevel8_dall_shift_ratio', 'word_dist_rlevel9_dall_shift_diff', 'word_dist_rlevel9_dall_shift_ratio', 'word_freq_rlevel10_dall_shift_diff', 'word_freq_rlevel10_dall_shift_ratio', 'word_freq_rlevel1_dall_shift_diff', 'word_freq_rlevel1_dall_shift_ratio', 'word_freq_rlevel2_dall_shift_diff', 'word_freq_rlevel2_dall_shift_ratio', 'word_freq_rlevel3_dall_shift_diff', 'word_freq_rlevel3_dall_shift_ratio', 'word_freq_rlevel4_dall_shift_diff', 'word_freq_rlevel4_dall_shift_ratio', 'word_freq_rlevel5_dall_shift_diff', 'word_freq_rlevel5_dall_shift_ratio', 'word_freq_rlevel6_dall_shift_diff', 'word_freq_rlevel6_dall_shift_ratio', 'word_freq_rlevel7_dall_shift_diff', 'word_freq_rlevel7_dall_shift_ratio', 'word_freq_rlevel8_dall_shift_diff', 'word_freq_rlevel8_dall_shift_ratio', 'word_freq_rlevel9_dall_shift_diff', 'word_freq_rlevel9_dall_shift_ratio', 'app_dist_rlevel10_dall_shift_diff', 'app_dist_rlevel10_dall_shift_ratio', 'app_dist_rlevel1_dall_shift_diff', 'app_dist_rlevel1_dall_shift_ratio', 'app_dist_rlevel2_dall_shift_diff', 'app_dist_rlevel2_dall_shift_ratio', 'app_dist_rlevel3_dall_shift_diff', 'app_dist_rlevel3_dall_shift_ratio', 'app_dist_rlevel4_dall_shift_diff', 'app_dist_rlevel4_dall_shift_ratio', 'app_dist_rlevel5_dall_shift_diff', 'app_dist_rlevel5_dall_shift_ratio', 'app_dist_rlevel6_dall_shift_diff', 'app_dist_rlevel6_dall_shift_ratio', 'app_dist_rlevel7_dall_shift_diff', 'app_dist_rlevel7_dall_shift_ratio', 'app_dist_rlevel8_dall_shift_diff', 'app_dist_rlevel8_dall_shift_ratio', 'app_dist_rlevel9_dall_shift_diff', 'app_dist_rlevel9_dall_shift_ratio', 'app_freq_rlevel10_dall_shift_diff', 'app_freq_rlevel10_dall_shift_ratio', 'app_freq_rlevel1_dall_shift_diff', 'app_freq_rlevel1_dall_shift_ratio', 'app_freq_rlevel2_dall_shift_diff', 'app_freq_rlevel2_dall_shift_ratio', 'app_freq_rlevel3_dall_shift_diff', 'app_freq_rlevel3_dall_shift_ratio', 'app_freq_rlevel4_dall_shift_diff', 'app_freq_rlevel4_dall_shift_ratio', 'app_freq_rlevel5_dall_shift_diff', 'app_freq_rlevel5_dall_shift_ratio', 'app_freq_rlevel6_dall_shift_diff', 'app_freq_rlevel6_dall_shift_ratio', 'app_freq_rlevel7_dall_shift_diff', 'app_freq_rlevel7_dall_shift_ratio', 'app_freq_rlevel8_dall_shift_diff', 'app_freq_rlevel8_dall_shift_ratio', 'app_freq_rlevel9_dall_shift_diff', 'app_freq_rlevel9_dall_shift_ratio'] not found in axis"

In [39]:
cols_in_df_res_not_in_test = df_res.columns[~df_res.columns.isin(test_drop.columns)]
for col in cols_in_df_res_not_in_test:
    print(col)


app_dist_rlevel8_d1_cnt
app_dist_rlevel8_d1_ratio
app_dist_rlevel8_d3_cnt
app_dist_rlevel8_d3_ratio
app_dist_rlevel8_d7_cnt
app_dist_rlevel8_d7_ratio
app_dist_rlevel8_d15_cnt
app_dist_rlevel8_d15_ratio
app_dist_rlevel8_d30_cnt
app_dist_rlevel8_d30_ratio
app_dist_rlevel8_d60_cnt
app_dist_rlevel8_d60_ratio
app_dist_rlevel8_d90_cnt
app_dist_rlevel8_d90_ratio
app_dist_rlevel8_d180_cnt
app_dist_rlevel8_d180_ratio
app_dist_rlevel8_d360_cnt
app_dist_rlevel8_d360_ratio
app_dist_rlevel8_dall_cnt
app_dist_rlevel8_dall_ratio
app_dist_rlevel9_d1_cnt
app_dist_rlevel9_d1_ratio
app_dist_rlevel9_d3_cnt
app_dist_rlevel9_d3_ratio
app_dist_rlevel9_d7_cnt
app_dist_rlevel9_d7_ratio
app_dist_rlevel9_d15_cnt
app_dist_rlevel9_d15_ratio
app_dist_rlevel9_d30_cnt
app_dist_rlevel9_d30_ratio
app_dist_rlevel9_d60_cnt
app_dist_rlevel9_d60_ratio
app_dist_rlevel9_d90_cnt
app_dist_rlevel9_d90_ratio
app_dist_rlevel9_d180_cnt
app_dist_rlevel9_d180_ratio
app_dist_rlevel9_d360_cnt
app_dist_rlevel9_d360_ratio
app_dist_rleve