In [335]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd
import numpy as np
import warnings
import random
from tqdm import tqdm 
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import EasyEnsembleClassifier

# Suppress every warning category for the remainder of the session.
warnings.simplefilter('ignore')

# Seed constant for the notebook.
seed = 0

In [6]:
def seed_everything(seed=0):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed once with the default value.
seed_everything()

In [7]:
path = 'zindi-new-user-engagement-prediction-challenge/'


def _read_table(name):
    """Read ``<path><name>.csv`` from the data folder into a DataFrame."""
    return pd.read_csv(f"{path}{name}.csv")


# One DataFrame per CSV file shipped in the data folder.
CompetitionPartipation = _read_table("CompetitionPartipation")
UserActivity = _read_table("UserActivity")
Competition = _read_table("Competition")
Blogs = _read_table("Blogs")
SampleSubmission = _read_table("SampleSubmission")
VariableDefinitions = _read_table("VariableDefinitions")
Users = _read_table("Users")
Discussion = _read_table("Discussion")
Jobs = _read_table("Jobs")
Comments = _read_table("Comments")

# Display settings: no truncation of cell text, up to 500 columns and 500 rows.
# Fully qualified option names are required: pandas matches option names as
# patterns, and since pandas 1.4 the short forms "max_columns" / "max_rows"
# also match the "styler.render.*" options, which makes pd.set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

In [8]:
# Reduce the number of distinct activity types by stripping the IDs from titles.
def remove_id(x):
    """Normalise an activity title.

    Every '$' is removed first. If the result contains 'ID', everything from
    the last underscore onwards is dropped (an empty string is returned when
    there is no underscore). Otherwise spaces are replaced by underscores.
    """
    cleaned = x.replace('$', '')
    if 'ID' not in cleaned:
        return cleaned.replace(' ', '_')
    # rpartition yields ('', '', text) when no underscore exists, so the head
    # is empty in that case.
    head, _sep, _tail = cleaned.rpartition('_')
    return head


In [9]:
# Normalised activity type for every logged event, derived from its title.
UserActivity['main_activities'] = UserActivity['Title'].map(remove_id)

In [10]:
# User ids listed in the submission file, with the '_Month_5' suffix removed.
submission_users = (
    SampleSubmission['User_ID_Next_month_Activity']
    .str.replace('_Month_5', '')
    .to_frame()
)

# Activity types performed by at least one of those users.
submission_activity = submission_users.merge(
    UserActivity,
    left_on='User_ID_Next_month_Activity',
    right_on='User_ID',
)
selected_activities = submission_activity['main_activities'].unique()

# Keep only the events whose activity type is in that set.
keep_mask = UserActivity.main_activities.isin(selected_activities)
UserActivity = UserActivity[keep_mask].reset_index(drop=True)

In [11]:
# Count how many times each user performed each activity type in each month,
# then sort descending by user, month and count.
activity_counts = UserActivity.groupby(
    ['User_ID', 'main_activities', 'datetime Month']
).size()

dataset = (
    activity_counts
    .reset_index(name='count')
    .sort_values(['User_ID', 'datetime Month', 'count'], ascending=False)
)

dataset

Unnamed: 0,User_ID,main_activities,datetime Month,count
112843,ID_ZZVUJ45W,Downloaded_Competition_Datafile,3,3
112847,ID_ZZVUJ45W,Viewed_All_Discussions,3,2
112842,ID_ZZVUJ45W,Confirmed_Email,3,1
112844,ID_ZZVUJ45W,Joined_Competition,3,1
112845,ID_ZZVUJ45W,Signed_Up,3,1
...,...,...,...,...
1,ID_000H9XOP,Viewed_All_Competitions,5,1
2,ID_000H9XOP,Viewed_All_Discussions,5,1
3,ID_000H9XOP,comp_ID,5,1
4,ID_000H9XOP,create_alias,5,1


In [12]:
# One row per distinct user present in the aggregated activity table, in order
# of first appearance. Users that are listed only in SampleSubmission (and have
# no rows in `dataset`) are not added.
Selected_users = (
    dataset
    .drop_duplicates(subset='User_ID')[['User_ID']]
    .reset_index(drop=True)
)

In [13]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the country column; missing countries share the 'unspecified' label.
# The encoded values are written back onto Users itself.
le = LabelEncoder()
Users['Countries_ID'] = Users['Countries_ID'].fillna('unspecified')
Users['Countries_ID'] = le.fit_transform(Users['Countries_ID'])

# Attach user attributes to the selected users (inner join: users missing from
# Users are dropped) and discard the creation time/year columns.
user_df = (
    Selected_users
    .merge(Users, on='User_ID', how='inner')
    .drop(columns=['Created At time', 'Created At Year'])
)
user_df

Unnamed: 0,User_ID,FeatureX,FeatureY,Countries_ID,Created At Month,Created At Day_of_month
0,ID_ZZVUJ45W,0,0,146,3,5
1,ID_ZZUNZJ2U,0,0,146,2,30
2,ID_ZZQK0LU6,0,3,146,3,29
3,ID_ZZMHNLNT,0,0,146,4,1
4,ID_ZZJKFBR5,0,1,143,12,8
...,...,...,...,...,...,...
10395,ID_00ETFE7J,0,0,61,1,6
10396,ID_008D2J6C,0,0,60,1,7
10397,ID_004UKSJV,0,0,69,12,28
10398,ID_003YJHQA,0,1,60,4,12


In [14]:
all_activities = dataset.main_activities.unique()
month_cols = ['datetime Month_' + str(i) for i in (1, 2, 3, 4, 5, 11, 12)]
test_ids = SampleSubmission.User_ID_Next_month_Activity.str.replace('_Month_5', '').unique()

# Start from the bare user list and widen it by one group of month columns per activity.
activity_df = user_df[['User_ID']]

for activity in all_activities:
    # Rows for this activity only: one row per (user, month) with its event count.
    activity_rows = dataset.loc[dataset.main_activities == activity]
    activity_rows = activity_rows.drop(columns='main_activities').reset_index(drop=True)

    # One indicator column per month that occurs for this activity.
    _df = pd.get_dummies(activity_rows, columns=['datetime Month'])
    for col in month_cols:
        # Months never seen for this activity still get an (all-zero) column.
        if col not in _df.columns:
            _df[col] = 0
        # Turn the month indicator into the event count for that month.
        _df[col] = _df[col] * _df['count']

    # Collapse to a single row per user by summing the per-month counts.
    _df = _df.drop(columns='count').groupby('User_ID').sum().reset_index()

    # Align to the full user list; users without this activity get 0 everywhere.
    _df = user_df[['User_ID']].merge(_df, on='User_ID', how='left')
    _df[month_cols] = _df[month_cols].fillna(0)

    # Rename 'datetime Month_<m>' -> '<activity>_month_<m>'.
    _df.columns = _df.columns.str.replace('datetime Month_', f'{activity}_month_')

    activity_df = activity_df.merge(_df, on='User_ID', how='left')

# Append the user attributes (country, creation date parts, ...).
activity_df = activity_df.merge(user_df, on='User_ID', how='left')

# Users listed in the sample submission form the test set; everyone else is training data.
in_submission = activity_df.User_ID.isin(test_ids)
train_df = activity_df[~in_submission].reset_index(drop=True)
test_df = activity_df[in_submission].reset_index(drop=True)

display(train_df, test_df)

Unnamed: 0,User_ID,Downloaded_Competition_Datafile_month_1,Downloaded_Competition_Datafile_month_2,Downloaded_Competition_Datafile_month_3,Downloaded_Competition_Datafile_month_4,Downloaded_Competition_Datafile_month_5,Downloaded_Competition_Datafile_month_11,Downloaded_Competition_Datafile_month_12,Viewed_All_Discussions_month_1,Viewed_All_Discussions_month_2,Viewed_All_Discussions_month_3,Viewed_All_Discussions_month_4,Viewed_All_Discussions_month_5,Viewed_All_Discussions_month_11,Viewed_All_Discussions_month_12,Confirmed_Email_month_1,Confirmed_Email_month_2,Confirmed_Email_month_3,Confirmed_Email_month_4,Confirmed_Email_month_5,Confirmed_Email_month_11,Confirmed_Email_month_12,Joined_Competition_month_1,Joined_Competition_month_2,Joined_Competition_month_3,Joined_Competition_month_4,Joined_Competition_month_5,Joined_Competition_month_11,Joined_Competition_month_12,Signed_Up_month_1,Signed_Up_month_2,Signed_Up_month_3,Signed_Up_month_4,Signed_Up_month_5,Signed_Up_month_11,Signed_Up_month_12,Viewed_All_Competitions_month_1,Viewed_All_Competitions_month_2,Viewed_All_Competitions_month_3,Viewed_All_Competitions_month_4,Viewed_All_Competitions_month_5,Viewed_All_Competitions_month_11,Viewed_All_Competitions_month_12,Viewed_All_Learning_Pages_month_1,Viewed_All_Learning_Pages_month_2,Viewed_All_Learning_Pages_month_3,Viewed_All_Learning_Pages_month_4,Viewed_All_Learning_Pages_month_5,Viewed_All_Learning_Pages_month_11,Viewed_All_Learning_Pages_month_12,badge_OCZE_month_1,badge_OCZE_month_2,badge_OCZE_month_3,badge_OCZE_month_4,badge_OCZE_month_5,badge_OCZE_month_11,badge_OCZE_month_12,create_alias_month_1,create_alias_month_2,create_alias_month_3,create_alias_month_4,create_alias_month_5,create_alias_month_11,create_alias_month_12,identify_month_1,identify_month_2,identify_month_3,identify_month_4,identify_month_5,identify_month_11,identify_month_12,comp_ID_month_1,comp_ID_month_2,comp_ID_month_3,comp_ID_month_4,comp_ID_month_5,comp_ID_month_11,comp_ID_month_12,Signed
_In_month_1,Signed_In_month_2,Signed_In_month_3,Signed_In_month_4,Signed_In_month_5,Signed_In_month_11,Signed_In_month_12,Updated_Discussion_month_1,Updated_Discussion_month_2,Updated_Discussion_month_3,Updated_Discussion_month_4,Updated_Discussion_month_5,Updated_Discussion_month_11,Updated_Discussion_month_12,Viewed_All_Jobs_month_1,Viewed_All_Jobs_month_2,Viewed_All_Jobs_month_3,Viewed_All_Jobs_month_4,Viewed_All_Jobs_month_5,Viewed_All_Jobs_month_11,Viewed_All_Jobs_month_12,job_ID_month_1,job_ID_month_2,job_ID_month_3,job_ID_month_4,job_ID_month_5,job_ID_month_11,job_ID_month_12,Created_Submission_month_1,Created_Submission_month_2,Created_Submission_month_3,Created_Submission_month_4,Created_Submission_month_5,Created_Submission_month_11,Created_Submission_month_12,Signed_Out_month_1,Signed_Out_month_2,Signed_Out_month_3,Signed_Out_month_4,Signed_Out_month_5,Signed_Out_month_11,Signed_Out_month_12,Updated_Profile_month_1,Updated_Profile_month_2,Updated_Profile_month_3,Updated_Profile_month_4,Updated_Profile_month_5,Updated_Profile_month_11,Updated_Profile_month_12,blog_ID_month_1,blog_ID_month_2,blog_ID_month_3,blog_ID_month_4,blog_ID_month_5,blog_ID_month_11,blog_ID_month_12,Joined_Team_month_1,Joined_Team_month_2,Joined_Team_month_3,Joined_Team_month_4,Joined_Team_month_5,Joined_Team_month_11,Joined_Team_month_12,Created_Team_month_1,Created_Team_month_2,Created_Team_month_3,Created_Team_month_4,Created_Team_month_5,Created_Team_month_11,Created_Team_month_12,Updated_Comment_month_1,Updated_Comment_month_2,Updated_Comment_month_3,Updated_Comment_month_4,Updated_Comment_month_5,Updated_Comment_month_11,Updated_Comment_month_12,Viewed_Discussion_month_1,Viewed_Discussion_month_2,Viewed_Discussion_month_3,Viewed_Discussion_month_4,Viewed_Discussion_month_5,Viewed_Discussion_month_11,Viewed_Discussion_month_12,Updated_Submission_month_1,Updated_Submission_month_2,Updated_Submission_month_3,Updated_Submission_month_4,Updated_Submission_month_5,Updated_Submission_m
onth_11,Updated_Submission_month_12,Invited_Member_To_Team_month_2,Invited_Member_To_Team_month_3,Invited_Member_To_Team_month_4,Invited_Member_To_Team_month_5,Invited_Member_To_Team_month_1,Invited_Member_To_Team_month_11,Invited_Member_To_Team_month_12,badge_PLDS_month_2,badge_PLDS_month_3,badge_PLDS_month_4,badge_PLDS_month_5,badge_PLDS_month_1,badge_PLDS_month_11,badge_PLDS_month_12,Viewed_FAQ_month_1,Viewed_FAQ_month_2,Viewed_FAQ_month_3,Viewed_FAQ_month_4,Viewed_FAQ_month_5,Viewed_FAQ_month_11,Viewed_FAQ_month_12,Applied_To_Job_month_1,Applied_To_Job_month_2,Applied_To_Job_month_3,Applied_To_Job_month_4,Applied_To_Job_month_5,Applied_To_Job_month_11,Applied_To_Job_month_12,badge_MLPD_month_1,badge_MLPD_month_2,badge_MLPD_month_3,badge_MLPD_month_4,badge_MLPD_month_5,badge_MLPD_month_11,badge_MLPD_month_12,Accepted_Team_Leadership_Transfer_month_2,Accepted_Team_Leadership_Transfer_month_3,Accepted_Team_Leadership_Transfer_month_4,Accepted_Team_Leadership_Transfer_month_5,Accepted_Team_Leadership_Transfer_month_1,Accepted_Team_Leadership_Transfer_month_11,Accepted_Team_Leadership_Transfer_month_12,Updated_Team_month_1,Updated_Team_month_2,Updated_Team_month_3,Updated_Team_month_4,Updated_Team_month_11,Updated_Team_month_5,Updated_Team_month_12,Deleted_Team_month_1,Deleted_Team_month_2,Deleted_Team_month_3,Deleted_Team_month_4,Deleted_Team_month_5,Deleted_Team_month_12,Deleted_Team_month_11,Votes_(Up/Down)_month_3,Votes_(Up/Down)_month_4,Votes_(Up/Down)_month_5,Votes_(Up/Down)_month_1,Votes_(Up/Down)_month_2,Votes_(Up/Down)_month_11,Votes_(Up/Down)_month_12,FeatureX,FeatureY,Countries_ID,Created At Month,Created At Day_of_month
0,ID_ZZVUJ45W,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,3,5
1,ID_ZZUNZJ2U,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,2,30
2,ID_ZZQK0LU6,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,146,3,29
3,ID_ZZJKFBR5,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,143,12,8
4,ID_ZZHDXC1Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,2,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9657,ID_00RKDLAX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,6,12,23
9658,ID_00ETFE7J,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,61,1,6
9659,ID_008D2J6C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,60,1,7
9660,ID_004UKSJV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,69,12,28


Unnamed: 0,User_ID,Downloaded_Competition_Datafile_month_1,Downloaded_Competition_Datafile_month_2,Downloaded_Competition_Datafile_month_3,Downloaded_Competition_Datafile_month_4,Downloaded_Competition_Datafile_month_5,Downloaded_Competition_Datafile_month_11,Downloaded_Competition_Datafile_month_12,Viewed_All_Discussions_month_1,Viewed_All_Discussions_month_2,Viewed_All_Discussions_month_3,Viewed_All_Discussions_month_4,Viewed_All_Discussions_month_5,Viewed_All_Discussions_month_11,Viewed_All_Discussions_month_12,Confirmed_Email_month_1,Confirmed_Email_month_2,Confirmed_Email_month_3,Confirmed_Email_month_4,Confirmed_Email_month_5,Confirmed_Email_month_11,Confirmed_Email_month_12,Joined_Competition_month_1,Joined_Competition_month_2,Joined_Competition_month_3,Joined_Competition_month_4,Joined_Competition_month_5,Joined_Competition_month_11,Joined_Competition_month_12,Signed_Up_month_1,Signed_Up_month_2,Signed_Up_month_3,Signed_Up_month_4,Signed_Up_month_5,Signed_Up_month_11,Signed_Up_month_12,Viewed_All_Competitions_month_1,Viewed_All_Competitions_month_2,Viewed_All_Competitions_month_3,Viewed_All_Competitions_month_4,Viewed_All_Competitions_month_5,Viewed_All_Competitions_month_11,Viewed_All_Competitions_month_12,Viewed_All_Learning_Pages_month_1,Viewed_All_Learning_Pages_month_2,Viewed_All_Learning_Pages_month_3,Viewed_All_Learning_Pages_month_4,Viewed_All_Learning_Pages_month_5,Viewed_All_Learning_Pages_month_11,Viewed_All_Learning_Pages_month_12,badge_OCZE_month_1,badge_OCZE_month_2,badge_OCZE_month_3,badge_OCZE_month_4,badge_OCZE_month_5,badge_OCZE_month_11,badge_OCZE_month_12,create_alias_month_1,create_alias_month_2,create_alias_month_3,create_alias_month_4,create_alias_month_5,create_alias_month_11,create_alias_month_12,identify_month_1,identify_month_2,identify_month_3,identify_month_4,identify_month_5,identify_month_11,identify_month_12,comp_ID_month_1,comp_ID_month_2,comp_ID_month_3,comp_ID_month_4,comp_ID_month_5,comp_ID_month_11,comp_ID_month_12,Signed
_In_month_1,Signed_In_month_2,Signed_In_month_3,Signed_In_month_4,Signed_In_month_5,Signed_In_month_11,Signed_In_month_12,Updated_Discussion_month_1,Updated_Discussion_month_2,Updated_Discussion_month_3,Updated_Discussion_month_4,Updated_Discussion_month_5,Updated_Discussion_month_11,Updated_Discussion_month_12,Viewed_All_Jobs_month_1,Viewed_All_Jobs_month_2,Viewed_All_Jobs_month_3,Viewed_All_Jobs_month_4,Viewed_All_Jobs_month_5,Viewed_All_Jobs_month_11,Viewed_All_Jobs_month_12,job_ID_month_1,job_ID_month_2,job_ID_month_3,job_ID_month_4,job_ID_month_5,job_ID_month_11,job_ID_month_12,Created_Submission_month_1,Created_Submission_month_2,Created_Submission_month_3,Created_Submission_month_4,Created_Submission_month_5,Created_Submission_month_11,Created_Submission_month_12,Signed_Out_month_1,Signed_Out_month_2,Signed_Out_month_3,Signed_Out_month_4,Signed_Out_month_5,Signed_Out_month_11,Signed_Out_month_12,Updated_Profile_month_1,Updated_Profile_month_2,Updated_Profile_month_3,Updated_Profile_month_4,Updated_Profile_month_5,Updated_Profile_month_11,Updated_Profile_month_12,blog_ID_month_1,blog_ID_month_2,blog_ID_month_3,blog_ID_month_4,blog_ID_month_5,blog_ID_month_11,blog_ID_month_12,Joined_Team_month_1,Joined_Team_month_2,Joined_Team_month_3,Joined_Team_month_4,Joined_Team_month_5,Joined_Team_month_11,Joined_Team_month_12,Created_Team_month_1,Created_Team_month_2,Created_Team_month_3,Created_Team_month_4,Created_Team_month_5,Created_Team_month_11,Created_Team_month_12,Updated_Comment_month_1,Updated_Comment_month_2,Updated_Comment_month_3,Updated_Comment_month_4,Updated_Comment_month_5,Updated_Comment_month_11,Updated_Comment_month_12,Viewed_Discussion_month_1,Viewed_Discussion_month_2,Viewed_Discussion_month_3,Viewed_Discussion_month_4,Viewed_Discussion_month_5,Viewed_Discussion_month_11,Viewed_Discussion_month_12,Updated_Submission_month_1,Updated_Submission_month_2,Updated_Submission_month_3,Updated_Submission_month_4,Updated_Submission_month_5,Updated_Submission_m
onth_11,Updated_Submission_month_12,Invited_Member_To_Team_month_2,Invited_Member_To_Team_month_3,Invited_Member_To_Team_month_4,Invited_Member_To_Team_month_5,Invited_Member_To_Team_month_1,Invited_Member_To_Team_month_11,Invited_Member_To_Team_month_12,badge_PLDS_month_2,badge_PLDS_month_3,badge_PLDS_month_4,badge_PLDS_month_5,badge_PLDS_month_1,badge_PLDS_month_11,badge_PLDS_month_12,Viewed_FAQ_month_1,Viewed_FAQ_month_2,Viewed_FAQ_month_3,Viewed_FAQ_month_4,Viewed_FAQ_month_5,Viewed_FAQ_month_11,Viewed_FAQ_month_12,Applied_To_Job_month_1,Applied_To_Job_month_2,Applied_To_Job_month_3,Applied_To_Job_month_4,Applied_To_Job_month_5,Applied_To_Job_month_11,Applied_To_Job_month_12,badge_MLPD_month_1,badge_MLPD_month_2,badge_MLPD_month_3,badge_MLPD_month_4,badge_MLPD_month_5,badge_MLPD_month_11,badge_MLPD_month_12,Accepted_Team_Leadership_Transfer_month_2,Accepted_Team_Leadership_Transfer_month_3,Accepted_Team_Leadership_Transfer_month_4,Accepted_Team_Leadership_Transfer_month_5,Accepted_Team_Leadership_Transfer_month_1,Accepted_Team_Leadership_Transfer_month_11,Accepted_Team_Leadership_Transfer_month_12,Updated_Team_month_1,Updated_Team_month_2,Updated_Team_month_3,Updated_Team_month_4,Updated_Team_month_11,Updated_Team_month_5,Updated_Team_month_12,Deleted_Team_month_1,Deleted_Team_month_2,Deleted_Team_month_3,Deleted_Team_month_4,Deleted_Team_month_5,Deleted_Team_month_12,Deleted_Team_month_11,Votes_(Up/Down)_month_3,Votes_(Up/Down)_month_4,Votes_(Up/Down)_month_5,Votes_(Up/Down)_month_1,Votes_(Up/Down)_month_2,Votes_(Up/Down)_month_11,Votes_(Up/Down)_month_12,FeatureX,FeatureY,Countries_ID,Created At Month,Created At Day_of_month
0,ID_ZZMHNLNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,4,1
1,ID_ZWDD4T41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,146,4,21
2,ID_ZUVTLGY7,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,146,4,10
3,ID_ZT27LY0Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,4,6
4,ID_ZQM6PUGO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,4,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733,ID_04OT901K,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,4,9
734,ID_04GIS1QZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,4,10
735,ID_02F14MDW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,139,4,14
736,ID_028JJTDW,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,146,4,8


### Create the training data

In [347]:
# Build one training row per user: the activity counts recorded in the month the
# account was created, plus a binary `Active` label telling whether the user had
# any activity in the following month.
# Rows are collected in a list and turned into a DataFrame once, because
# appending to a DataFrame inside the loop copies the whole frame every time
# (and DataFrame.append no longer exists in pandas 2).
train_records = []
for i in tqdm(range(len(train_df))):
    user = train_df.loc[i]
    created_month = user['Created At Month']
    # Users created in month 5 are left out of training.
    # NOTE(review): presumably because their next-month label is what the
    # submission has to predict - confirm.
    if created_month not in [5]:
        user_dict = {'User_ID': user['User_ID']}
        for activity in all_activities:
            user_dict[f'{activity}'] = user[f'{activity}_month_{created_month}']
        # December wraps around to January.
        next_month = 1 if created_month == 12 else created_month + 1
        next_month_cols = [f'{activity}_month_{next_month}' for activity in all_activities]
        # Stored as a float (0.0 / 1.0) to keep the dtype the append-based
        # version produced.
        user_dict['Active'] = float(user[next_month_cols].sum() > 0)

        train_records.append(user_dict)

new_train = pd.DataFrame(train_records)

new_train

100%|██████████| 9662/9662 [00:46<00:00, 205.74it/s]


Unnamed: 0,User_ID,Downloaded_Competition_Datafile,Viewed_All_Discussions,Confirmed_Email,Joined_Competition,Signed_Up,Viewed_All_Competitions,Viewed_All_Learning_Pages,badge_OCZE,create_alias,identify,comp_ID,Signed_In,Updated_Discussion,Viewed_All_Jobs,job_ID,Created_Submission,Signed_Out,Updated_Profile,blog_ID,Joined_Team,Created_Team,Updated_Comment,Viewed_Discussion,Updated_Submission,Invited_Member_To_Team,badge_PLDS,Viewed_FAQ,Applied_To_Job,badge_MLPD,Accepted_Team_Leadership_Transfer,Updated_Team,Deleted_Team,Votes_(Up/Down),Active
0,ID_ZZVUJ45W,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ID_ZZUNZJ2U,0.0,1.0,1.0,0.0,1.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_ZZQK0LU6,7.0,4.0,1.0,2.0,1.0,14.0,1.0,1.0,0.0,0.0,35.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_ZZJKFBR5,5.0,3.0,1.0,1.0,1.0,3.0,3.0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ID_ZZHDXC1Q,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7901,ID_00SL0NNP,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7902,ID_00RKDLAX,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7903,ID_00ETFE7J,0.0,1.0,1.0,0.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7904,ID_008D2J6C,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Create the test data

In [349]:
# Build one test row per user from the activity counts of the month in which the
# account was created (no label is available for the test users).
# Rows are collected in a list and converted once; growing a DataFrame row by
# row copies the whole frame on every iteration and DataFrame.append no longer
# exists in pandas 2.
test_records = []
for i in tqdm(range(len(test_df))):
    user = test_df.loc[i]
    created_month = user['Created At Month']
    user_dict = {'User_ID': user['User_ID']}
    for activity in all_activities:
        user_dict[f'{activity}'] = user[f'{activity}_month_{created_month}']

    test_records.append(user_dict)

new_test = pd.DataFrame(test_records)

new_test

100%|██████████| 738/738 [00:02<00:00, 298.55it/s]


Unnamed: 0,User_ID,Downloaded_Competition_Datafile,Viewed_All_Discussions,Confirmed_Email,Joined_Competition,Signed_Up,Viewed_All_Competitions,Viewed_All_Learning_Pages,badge_OCZE,create_alias,identify,comp_ID,Signed_In,Updated_Discussion,Viewed_All_Jobs,job_ID,Created_Submission,Signed_Out,Updated_Profile,blog_ID,Joined_Team,Created_Team,Updated_Comment,Viewed_Discussion,Updated_Submission,Invited_Member_To_Team,badge_PLDS,Viewed_FAQ,Applied_To_Job,badge_MLPD,Accepted_Team_Leadership_Transfer,Updated_Team,Deleted_Team,Votes_(Up/Down)
0,ID_ZZMHNLNT,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ID_ZWDD4T41,0.0,1.0,0.0,0.0,1.0,1.0,3.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_ZUVTLGY7,4.0,13.0,1.0,1.0,1.0,5.0,3.0,1.0,0.0,0.0,26.0,1.0,0.0,5.0,3.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_ZT27LY0Q,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_ZQM6PUGO,0.0,3.0,0.0,0.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733,ID_04OT901K,2.0,3.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
734,ID_04GIS1QZ,0.0,2.0,1.0,0.0,1.0,7.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
735,ID_02F14MDW,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
736,ID_028JJTDW,2.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### to extract jobs info

In [350]:
# Job-related activity: rows whose title starts with the 'job' token.
# An explicit copy is taken because a column is added to the filtered frame.
job = UserActivity[UserActivity['Title'].str.split('_', expand=True)[0] == 'job'].copy()
# Strip the 'job_' prefix so the remainder can be joined against Jobs.job_ID.
job['type'] = job['Title'].str.replace('job_', '', regex=False)
job = job.merge(Jobs, left_on='type', right_on='job_ID', how='left')
job = job.merge(Users[['User_ID', 'Created At Month']], how='left')
# Keep only the events that happened in the month the user signed up.
job = job[job['datetime Month'] == job['Created At Month']]
job = job.drop(['Created At Month'], axis=1)
# One-hot encode the categorical job attributes.
jobs = pd.get_dummies(job, columns=['Remote', 'Experience', 'Industry', 'Employment Type', 'Company Size', 'Data Science Functions'])
# NOTE(review): these per-user aggregates are attached to every event row and
# then summed by the groupby below, so the stored values are multiplied by the
# user's number of rows (in the displayed output `count` equals `count_jobs`
# squared). Behaviour is left unchanged here - confirm whether it is intended.
jobs = jobs.merge(job.groupby('User_ID')['datetime Month'].agg(['std', 'var', 'count', 'max']).reset_index(), how='left')
jobs = jobs.drop(['Title', 'datetime time', 'datetime Year', 'datetime Month', 'datetime Day_of_month', 'type', 'job_ID'], axis=1)
# Collapse to one row per user.
jobs = jobs.groupby('User_ID').sum().reset_index()
jobs = jobs.merge(job.groupby('User_ID').size().reset_index().rename(columns={0: 'count_jobs'}), how='left')
# Remove special characters from the generated column names. The patterns are
# literal text, so regex matching is switched off explicitly instead of relying
# on the pandas default (which differs between major versions).
column_cleanup = [
    ('}', ''), ('{', ''), ('-', ''), ('_>', ''), ('_<', ''),
    ('"', ''), ('/', ''), (',', '_'), (' ', '_'),
]
for pattern, replacement in column_cleanup:
    jobs.columns = jobs.columns.str.replace(pattern, replacement, regex=False)
jobs.head()

Unnamed: 0,User_ID,Remote_False,Remote_True,Experience_12,Experience_25,Experience1,Experience5,Industry_Artificial_Intelligence,Industry_Financial_Services_Banking,Industry_Financial_Services,Industry_Management_Consulting,Industry_Ambassador,Industry_Government_Health_Financial_Services,Industry_Health,Industry_Insurance,Industry_Location,Industry_Logistics,Industry_Manufacturing_Sales,Industry_Media,Industry_Research,Industry_Scholarship,Industry_Technology,Industry_Transportation,Industry_Zindi,Employment_Type_contract,Employment_Type_fulltime,Employment_Type_internship,Employment_Type_parttime,Company_Size_110,Company_Size_1050,Company_Size_50200,Company_Size200,Data_Science_Functions_BusinessData_Analysis_Data_modelling_Data_engineering,Data_Science_Functions_BusinessData_Analysis_Data_modelling,Data_Science_Functions_BusinessData_Analysis,Data_Science_Functions_Data_engineering_AI_infrastructure_BusinessData_Analysis,Data_Science_Functions_Data_engineering_Data_modelling_AI_infrastructure_BusinessData_Analysis,Data_Science_Functions_Data_engineering_Data_modelling_BusinessData_Analysis_AI_infrastructure,Data_Science_Functions_Data_engineering_Data_modelling_BusinessData_Analysis,Data_Science_Functions_Data_engineering_Data_modelling_ML_Application_Deployment_AI_infrastructure,Data_Science_Functions_Data_engineering_Data_modelling_ML_Application_Deployment_BusinessData_Analysis,Data_Science_Functions_Data_engineering_Data_modelling,Data_Science_Functions_Data_engineering_ML_Application_Deployment_BusinessData_Analysis,Data_Science_Functions_Data_engineering_ML_Application_Deployment_Data_modelling,Data_Science_Functions_Data_engineering,Data_Science_Functions_Data_modelling_BusinessData_Analysis_AI_infrastructure,Data_Science_Functions_Data_modelling_BusinessData_Analysis_Data_engineering_ML_Application_Deployment,Data_Science_Functions_Data_modelling_BusinessData_Analysis,Data_Science_Functions_Data_modelling_Data_engineering,Data_Science_Functions_Data_model
ling,Data_Science_Functions_ML_Application_Deployment_BusinessData_Analysis_Data_modelling,Data_Science_Functions_ML_Application_Deployment,std,var,count,max,count_jobs
0,ID_01G0BFLB,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,1,0,0,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0.0,0.0,9,3,3
1,ID_02MW81ND,0,0,0,0,3,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0.0,0.0,9,3,3
2,ID_07S9C7KP,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,0.0,4,2,2
3,ID_0BL248QY,0,0,0,4,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,4,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0.0,0.0,16,20,4
4,ID_0I4F7SVC,0,1,0,0,2,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0.0,0.0,4,4,2


In [351]:
# Sign-up date columns that are joined onto the train/test frames later on.
user = Users.loc[:, ['User_ID', 'Created At Month', 'Created At Day_of_month']]
# Submission ids carry a '_Month_5' suffix; removing it leaves the plain User_ID.
submission_ids = SampleSubmission['User_ID_Next_month_Activity'].str.replace('_Month_5', '')
testid = pd.DataFrame({'User_ID': submission_ids})

### Competition data

In [352]:
# Map a calendar month to its position in the data's ordering, which runs from
# August (1) to July (12). Every month appears exactly once; this is the same
# mapping as `dict_mon` used for the competition features further down.
mon = {8: 1, 9: 2, 10: 3, 11: 4, 12: 5, 1: 6, 2: 7, 3: 8, 4: 9, 5: 10, 6: 11, 7: 12}

In [353]:
# A missing end year is treated as 0; only rows whose end year is below 2 are kept.
end_year = Competition['End Time Year'].fillna(0)
Competition['End Time Year'] = end_year
Competition = Competition.loc[end_year < 2]

### to extract info from Competition data

In [354]:
# One-hot encode the coded competition features position by position.
cols=['FeatureA', 'FeatureB', 'FeatureF', 'FeatureG','FeatureH']
for i in cols:
    # Drop every non-alphanumeric character, then split on the empty pattern.
    # NOTE(review): this appears to yield one column per remaining character
    # (plus empty leading/trailing columns) - confirm against the pandas version in use.
    clean=Competition[i].str.replace(r'[^0-9a-zA-Z]', "", regex=True).str.split('', expand=True)
    # Missing positions (NaN or empty string) are replaced by the integer 0.
    clean=clean.fillna(0).replace('', 0)
    # Attach the split columns as '<feature>_<position>' ...
    Competition[[f'{i}_{j}' for j in clean.columns]]=clean
    # ... and one-hot encode each of them.
    # NOTE(review): the displayed header contains pairs such as 'FeatureF_2_0' and
    # 'FeatureF_2_0.1', which suggests the integer filler 0 and a literal '0'
    # character end up as two dummy columns with the same name - confirm.
    Competition=pd.get_dummies(Competition, columns=[f'{i}_{j}' for j in clean.columns])
    # The raw coded column is dropped once it has been expanded.
    Competition=Competition.drop(i, axis=1)

In [355]:
# One-hot encode the remaining categorical competition features.
onehot_features = ['FeatureC', 'FeatureD', 'FeatureE', 'FeatureI']
Competition = pd.get_dummies(Competition, columns=onehot_features)
Competition.head()

Unnamed: 0,SecretCode,Country_ID,Comp_ID,End Time time,End Time Year,End Time Month,End Time Day_of_month,Start Time time,Start Time Year,Start Time Month,Start Time Day_of_month,FeatureA_0_0,FeatureA_1_0,FeatureA_1_1,FeatureA_1_2,FeatureA_1_3,FeatureA_1_4,FeatureA_1_5,FeatureA_1_7,FeatureA_2_0,FeatureA_2_2,FeatureA_2_3,FeatureA_2_4,FeatureA_2_6,FeatureA_2_7,FeatureA_3_0,FeatureB_0_0,FeatureB_1_0,FeatureB_1_1,FeatureB_1_2,FeatureB_1_5,FeatureB_1_6,FeatureB_1_7,FeatureB_1_9,FeatureB_2_0,FeatureB_2_2,FeatureB_2_4,FeatureB_2_5,FeatureB_2_6,FeatureB_3_0,FeatureB_3_1,FeatureB_4_0,FeatureB_4_6,FeatureB_5_0,FeatureF_0_0,FeatureF_1_0,FeatureF_1_1,FeatureF_1_2,FeatureF_1_3,FeatureF_1_6,FeatureF_1_7,FeatureF_1_8,FeatureF_1_9,FeatureF_2_0,FeatureF_2_0.1,FeatureF_2_1,FeatureF_2_2,FeatureF_2_3,FeatureF_2_5,FeatureF_2_8,FeatureF_3_0,FeatureF_3_1,FeatureF_3_2,FeatureF_4_0,FeatureF_4_1,FeatureF_5_0,FeatureF_5_4,FeatureF_6_0,FeatureG_0_0,FeatureG_1_0,FeatureG_1_3,FeatureG_1_4,FeatureG_1_5,FeatureG_2_0,FeatureG_2_3,FeatureG_3_0,FeatureG_3_4,FeatureG_4_0,FeatureH_0_0,FeatureH_1_0,FeatureH_1_1,FeatureH_1_2,FeatureH_1_3,FeatureH_1_6,FeatureH_1_7,FeatureH_1_8,FeatureH_1_9,FeatureH_2_0,FeatureH_2_0.1,FeatureH_2_1,FeatureH_2_2,FeatureH_2_3,FeatureH_2_8,FeatureH_3_0,FeatureH_3_0.1,FeatureH_3_1,FeatureH_4_0,FeatureC_1.0,FeatureC_2.0,FeatureC_3.0,FeatureC_4.0,FeatureC_5.0,FeatureC_6.0,FeatureC_7.0,FeatureC_8.0,FeatureC_9.0,FeatureC_10.0,FeatureC_11.0,FeatureC_13.0,FeatureC_14.0,FeatureC_15.0,FeatureC_18.0,FeatureC_24.0,FeatureC_25.0,FeatureC_28.0,FeatureC_29.0,FeatureC_31.0,FeatureC_35.0,FeatureD_0,FeatureD_1,FeatureE_1,FeatureE_2,FeatureE_3,FeatureI_1.0,FeatureI_2.0,FeatureI_4.0,FeatureI_5.0,FeatureI_6.0,FeatureI_7.0,FeatureI_8.0,FeatureI_9.0,FeatureI_10.0,FeatureI_11.0,FeatureI_13.0,FeatureI_14.0,FeatureI_15.0,FeatureI_24.0,FeatureI_25.0,FeatureI_28.0,FeatureI_29.0,FeatureI_31.0,FeatureI_33.0,FeatureI_35.0
7,1,DYYF,ID_3MK3,09:30:00,1.0,1.0,24.0,09:30:00,1,1,23,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16,1,,ID_89DI,05:59:00,1.0,3.0,30.0,09:00:00,1,3,29,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
35,0,G3F0,ID_UU4Y,23:59:00,1.0,2.0,3.0,11:00:00,1,2,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,0,0,1,1,0,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
36,0,,ID_2MIC,23:59:00,1.0,4.0,25.0,06:00:00,1,1,24,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,0,0,1,1,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
76,0,,ID_YDM3,23:59:00,1.0,12.0,1.0,06:00:00,1,9,3,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# For every user, count the competitions available around the sign-up month.
# Three windows are used (months are positions in the Aug..Jul ordering):
#   * running in the sign-up month                 (start <= month <= end)
#   * running in the sign-up month and ending later (start <= month <  end)
#   * starting and ending after the sign-up month   (start >  month, end > month)
# Each window is reported four ways: all competitions, SecretCode == 1,
# SecretCode == 0, and competitions whose Country_ID equals the user's country.
user_id=[]
active=[]
secret=[]
zendi=[]
country=[]
start_nmonth=[]
secret_startnmonth=[]
zendi_startnmonth=[]
country_startnmonth=[]
goto_nmonth=[]
secret_gotonmonth=[]
zendi_gotonmonth=[]
country_gotonmonth=[]
dict_mon={8:1, 9:2, 10:3, 11:4, 12:5, 1:6, 2:7, 3:8, 4:9, 5:10, 6:11, 7:12}
users=Users.copy()
users['Created At Month']=users['Created At Month'].map(dict_mon)
competition=Competition.copy()
competition['End Time Month']=competition['End Time Month'].map(dict_mon)
competition['Start Time Month']=competition['Start Time Month'].map(dict_mon)
competition['End Time Year']=competition['End Time Year'].fillna(0)
competition['End Time Month']=competition['End Time Month'].fillna(0)
competition=competition[competition['End Time Year']<2]


def _window_counts(window, user_country):
    """Return (total, secret, non-secret, same-country) row counts for `window`."""
    return (
        len(window),
        int((window['SecretCode'] == 1).sum()),
        int((window['SecretCode'] == 0).sum()),
        # Compare against the user's scalar country id. Converting the whole
        # pandas Series to a string (as the earlier version did) can never
        # equal a Country_ID value, which left every country count at zero.
        int((window['Country_ID'] == user_country).sum()),
    )


end_month = competition['End Time Month']
start_month = competition['Start Time Month']
# Walk the three user columns together instead of re-filtering `users` for
# every id, which scanned the full frame twice per user.
user_rows = zip(users['User_ID'], users['Created At Month'], users['Countries_ID'])
for i, u, uc in tqdm(user_rows, total=len(users)):
    cop = competition[(end_month >= u) & (start_month <= u)]
    copn = competition[(end_month > u) & (start_month <= u)]
    copu = competition[(end_month > u) & (start_month > u)]

    n_all, n_secret, n_open, n_country = _window_counts(cop, uc)
    user_id.append(i)
    active.append(n_all)
    secret.append(n_secret)
    zendi.append(n_open)
    country.append(n_country)

    n_all, n_secret, n_open, n_country = _window_counts(copn, uc)
    start_nmonth.append(n_all)
    secret_startnmonth.append(n_secret)
    zendi_startnmonth.append(n_open)
    country_startnmonth.append(n_country)

    n_all, n_secret, n_open, n_country = _window_counts(copu, uc)
    goto_nmonth.append(n_all)
    secret_gotonmonth.append(n_secret)
    zendi_gotonmonth.append(n_open)
    country_gotonmonth.append(n_country)

 74%|███████▍  | 9212/12413 [01:53<00:33, 95.18it/s]

In [None]:
# Assemble the per-user competition-availability counts into one frame.
# Column order matches the order in which the lists were collected.
cop_feat = pd.DataFrame({
    'User_ID': user_id,
    'current_active_comp': active,
    'current_active_comp_seceret': secret,
    'current_active_comp_zendi': zendi,
    'current_active_comp_country': country,
    'next_active_comp': start_nmonth,
    'next_active_comp_seceret': secret_startnmonth,
    'next_active_comp_zendi': zendi_startnmonth,
    'next_active_comp_country': country_startnmonth,
    'upcome_active_comp': goto_nmonth,
    'upcome_active_comp_seceret': secret_gotonmonth,
    'upcome_active_comp_zendi': zendi_gotonmonth,
    'upcome_active_comp_country': country_gotonmonth,
})
cop_feat.head()

In [None]:
# Persist the competition-availability features without the index column.
# NOTE(review): a later cell reads 'cop_feat.csv', not the 'cop_feat2.csv' written
# here - confirm which file the merge step is meant to use.
cop_feat.to_csv('cop_feat2.csv', index=False)

### CompetitionPartipation data

In [None]:
# Rename the participation date columns so they do not clash with the user
# sign-up date columns brought in by the merge below.
CompetitionPartipation = CompetitionPartipation.rename(
    columns={'Created At Month': 'compet_month', 'Created At Day_of_month': 'compet_day'}
)
signup_cols = ['User_ID', 'Created At Day_of_month', 'Created At Month', 'Countries_ID']
compet_data = CompetitionPartipation.merge(Users[signup_cols], how='right', on='User_ID')
# Users who joined a competition in the month right after signing up.
joined_following_month = compet_data['Created At Month'] + 1 == compet_data['compet_month']
compid = compet_data.loc[joined_following_month, 'User_ID'].drop_duplicates()
# From here on only participations made during the sign-up month are used.
joined_signup_month = compet_data['Created At Month'] == compet_data['compet_month']
compet_data = compet_data[joined_signup_month]
compet = compet_data.merge(Competition, right_on='Comp_ID', left_on='Competition ID', how='left')
compet = compet.drop(columns=['Created At Month', 'Created At Year'])

In [None]:
# Preview the first five rows of the participation/competition join.
compet.head(5)

### date of competition 

In [None]:
# Largest sign-up day observed per month, used below as the month length.
max_day = (
    Users.groupby(['Created At Month'])['Created At Day_of_month']
    .max()
    .reset_index()
    .rename(columns={'Created At Day_of_month': 'max_day'})
)
compet = compet.merge(
    max_day, left_on='compet_month', right_on='Created At Month', how='left'
).drop('Created At Month', axis=1)
# Timing of each participation relative to the month and to the sign-up day.
compet = compet.assign(
    enrolled_week=lambda d: d['compet_day'] // 7,
    remain_day_to_next_month_comp=lambda d: d['max_day'] - d['compet_day'],
    remain_week_to_next_month_comp=lambda d: d['remain_day_to_next_month_comp'] // 7,
    diff_comp=lambda d: d['compet_day'] - d['Created At Day_of_month'],
)

In [None]:
# Preview compet after the date features were added.
compet.head()

### to extract date info and count CompetitionPartipation 

In [None]:
# Per-user summary statistics of the day of month on which competitions were joined.
join_day_stats = ['sum', 'min', 'max', 'count', 'std', 'median', 'var']
summ_com = compet.groupby('User_ID')['compet_day'].agg(join_day_stats)
summ_com['range'] = summ_com['max'] - summ_com['min']
# Suffix every statistic with 'comp' (User_ID is the index here, not a column).
cols = [col + 'comp' for col in summ_com.columns if col != 'User_ID']
summ_com.columns = cols


def _distinct_comp_count(frame, out_name):
    """Count, per user, the distinct (Competition ID, User_ID) pairs found in `frame`."""
    pairs = frame.groupby(['Competition ID', 'User_ID']).size().reset_index()
    per_user = pairs.groupby('User_ID')['Competition ID'].size().reset_index()
    return per_user.rename(columns={'Competition ID': out_name})


# Number of competitions joined per user: overall, SecretCode == 1, SecretCode == 0.
usercomps = compet[['User_ID']].drop_duplicates()
count_specs = [
    (compet, 'no_join_comp'),
    (compet[compet['SecretCode'] == 1], 'no_join_comp_secert'),
    (compet[compet['SecretCode'] == 0], 'no_join_comp_notsecert'),
]
for subset, out_name in count_specs:
    usercomps = usercomps.merge(_distinct_comp_count(subset, out_name), how='left')
usercomps.head()

### users  data

In [None]:
# Attach the training label to every user (users without a label get NaN).
country = Users.merge(new_train[['User_ID', 'Active']], how='left')
# Mean label per sign-up day, computed on the rows that have no missing value,
# then mapped back onto every user by sign-up day.
active_rate_by_day = (
    country.dropna()
    .groupby(['Created At Day_of_month'])['Active']
    .mean()
    .to_dict()
)
day_proba = country['Created At Day_of_month'].map(active_rate_by_day)
# Number of users per (sign-up month, country).
country_no = (
    Users.groupby(['Created At Month', 'Countries_ID'])['User_ID']
    .size()
    .reset_index()
    .rename(columns={'User_ID': 'nu_user_by_country'})
)
userdf = Users.merge(country_no, on=['Countries_ID', 'Created At Month'], how='left')

### users create date

In [None]:
# Position of the sign-up day inside the month.
signup_day = userdf['Created At Day_of_month']
userdf['Created At week_of_month'] = signup_day // 7
userdf['Created At week_day_of_month'] = signup_day % 7
# Largest sign-up day seen in each month, used as the month length.
month_last_day = (
    userdf.groupby(['Created At Month'])['Created At Day_of_month']
    .max()
    .reset_index()
    .rename(columns={'Created At Day_of_month': 'max_day'})
)
userdf = userdf.merge(month_last_day, on='Created At Month', how='left')
# Days / whole weeks left between sign-up and that last day.
userdf['remain_day_to_next_month'] = userdf['max_day'] - userdf['Created At Day_of_month']
userdf['remain_week_to_next_month'] = userdf['remain_day_to_next_month'] // 7


In [None]:
# Mean-label encoding of the sign-up day (computed in the users-data cell).
userdf['day_proba'] = day_proba
# Frequency encoding of the country; unseen/missing countries get 0.
country_frequency = userdf['Countries_ID'].value_counts().to_dict()
userdf['fren_country'] = userdf['Countries_ID'].map(country_frequency).fillna(0)
# Month position (via `mon`) times the month length, plus the day of month.
month_position = userdf['Created At Month'].map(mon)
userdf['ctreated_day_timestimp'] = month_position * userdf['max_day'] + userdf['Created At Day_of_month']
#userdf['country_mean_encoded']=userdf['Countries_ID'].map(country.dropna().groupby('Countries_ID')['Active'].mean().to_dict())
userdf['Feature_sum'] = userdf[['FeatureX', 'FeatureY']].sum(axis=1)

In [None]:
# Raw identifier/date columns are removed once the derived features exist.
raw_user_cols = ['Countries_ID', 'Created At Year', 'Created At Month', 'Created At Day_of_month']
userdf = userdf.drop(columns=raw_user_cols)

### More feature engineering on competition data

In [None]:
# Show the first row of compet.
compet.head(1)

In [None]:
# Show the first five rows of compet.
compet.head()

In [None]:
# Difference between the competition's end month and the month the user joined.
compet['remain_month_comp_end'] = compet['End Time Month'].sub(compet['compet_month'])
# True when the competition's Country_ID equals the user's Countries_ID.
compet['is_his_country_compet_hosted'] = compet['Countries_ID'].eq(compet['Country_ID'])
# Number of participation rows per user (group size, not a sum of the counts).
rows_per_user = compet.groupby(['User_ID'])['Successful Submission Count'].size().to_dict()
compet['submisson_size'] = compet['User_ID'].map(rows_per_user)

In [None]:
# Per-user counts of joined competitions whose Country_ID equals the user's own
# Countries_ID:
#   no_compet       - all such competitions
#   continue_month  - those with remain_month_comp_end > 0
#   secret_compet   - those with SecretCode == 1
#   remain_nmonth   - those satisfying both conditions
# The counts are computed with grouped sizes instead of filtering `compet` once
# per user. `compet` already carries the user's Countries_ID, so the per-user
# merge with Users that the loop version performed is not needed.
# Assumes Users holds one row per User_ID - TODO confirm.
user_id = usercomps['User_ID'].tolist()
own_country = compet[compet['Countries_ID'] == compet['Country_ID']]
ends_later = own_country['remain_month_comp_end'] > 0
is_secret = own_country['SecretCode'] == 1


def _rows_per_user(rows):
    """Return the number of `rows` per user, ordered like `user_id` (0 when absent)."""
    return rows.groupby('User_ID').size().reindex(user_id, fill_value=0).tolist()


no_compet = _rows_per_user(own_country)
continue_month = _rows_per_user(own_country[ends_later])
secret_compet = _rows_per_user(own_country[is_secret])
remain_nmonth = _rows_per_user(own_country[ends_later & is_secret])
    

In [None]:
# Assemble the own-country competition counts into one frame.
# `secret_compet` holds the SecretCode == 1 counts and `continue_month` holds the
# counts of competitions that end in a later month, so each list is stored under
# the column that describes it (they were previously assigned to each other's
# column).
infocomp = pd.DataFrame({
    'User_ID': user_id,
    'no_comps_host_own_country': no_compet,
    'no_comps_host_own_country_secret': secret_compet,
    'no_comps_host_own_country_remain_next': continue_month,
    'no_comps_host_own_country_next_scet': remain_nmonth,
})

In [None]:
# Flag participations whose competition does not end in the joining month.
compet['remain_month_comp_end_hot'] = compet['remain_month_comp_end'].ne(0)
# One-hot encode the categorical participation attributes.
participation_categoricals = [
    'Country_ID', 'is_his_country_compet_hosted', 'Successful Submission Count',
    'SecretCode', 'Participant Type', 'remain_month_comp_end_hot',
]
comps = pd.get_dummies(compet, columns=participation_categoricals)
# Frequency encodings: how often each competition / host country occurs in compet.
competition_frequency = compet['Competition ID'].value_counts().to_dict()
comps['comp id cont'] = compet['Competition ID'].map(competition_frequency).fillna(0)
host_country_frequency = compet['Country_ID'].value_counts().to_dict()
comps['countries cont'] = compet['Country_ID'].map(host_country_frequency).fillna(0)

In [None]:
# Preview the one-hot encoded participation frame.
comps.head()

In [None]:
# Raw date/time columns are removed before the per-user aggregation.
comps_date_cols = [
    'Created At Day_of_month', 'Created At time', 'compet_month', 'compet_day',
    'End Time time', 'End Time Day_of_month', 'Start Time time', 'Start Time Day_of_month',
    'Start Time Month', 'Start Time Year', 'End Time Month', 'End Time Year',
]
comps = comps.drop(columns=comps_date_cols)

In [None]:
# Collapse to one row per user by summing every remaining column.
comps_per_user = comps.groupby('User_ID').sum()
comps = comps_per_user.reset_index()

### users activity data

In [None]:
# Keep only the activity recorded in each user's sign-up month.
signup_months = Users[['User_ID', 'Created At Month']]
useract = UserActivity.merge(
    signup_months,
    how='right',
    left_on=['User_ID', 'datetime Month'],
    right_on=['User_ID', 'Created At Month'],
)
in_signup_month = useract['datetime Month'] == useract['Created At Month']
useract = useract[in_signup_month].drop('Created At Month', axis=1)
# Attach the per-month maximum day computed earlier (`max_day`).
useract = useract.merge(max_day, left_on=['datetime Month'], right_on=['Created At Month'], how='left')


In [None]:
# Split the 'HH:MM:SS' time string once and reuse the pieces.
clock_parts = useract['datetime time'].str.split(':', expand=True)
useract['hour'] = clock_parts[0].astype(int)
minute_of_hour = clock_parts[1].astype(int)

# Number of distinct (day, hour) slots in which each user was active.
# The key parts are joined with '_' so different day/hour combinations cannot
# collapse into the same string; '.' is removed as literal text (regex off).
day_key = useract['datetime Day_of_month'].astype(str).str.replace('.', '', regex=False)
useract['dayhour'] = day_key + '_' + useract['hour'].astype(str)
dayhour = (
    useract.groupby(['User_ID', 'dayhour']).size().reset_index()
    .groupby('User_ID').size().reset_index()
    .rename(columns={0: 'hour_count'})
)

# Number of distinct (day, hour, minute) slots per user. Without a separator
# hour 1 / minute 23 and hour 12 / minute 3 produced the same key, which
# under-counted the active minutes.
useract['minute'] = useract['dayhour'] + '_' + minute_of_hour.astype(str)
dayhourminute = (
    useract.groupby(['User_ID', 'minute']).size().reset_index()
    .groupby('User_ID').size().reset_index()
    .rename(columns={0: 'minute_count'})
)


### compet activity

In [None]:
# Competition page events: activity rows whose title starts with the 'comp' token.
# An explicit copy is taken because the Title column of the filtered frame is
# overwritten on the next line.
is_comp_event = useract['Title'].str.split('_', expand=True)[0] == 'comp'
useractcomp = useract[is_comp_event].copy()
# Strip the literal 'comp_' prefix so the remainder matches Competition.Comp_ID.
useractcomp['Title'] = useractcomp['Title'].str.replace('comp_', '', regex=False)
useractcomp = useractcomp.merge(
    Competition[['SecretCode', 'Comp_ID', 'End Time Month', 'Country_ID']],
    left_on='Title', right_on='Comp_ID', how='left',
)
# Month positions (via `mon`) between the event and the end of the competition.
useractcomp['remian_month_end_comp'] = useractcomp['End Time Month'].map(mon) - useractcomp['datetime Month'].map(mon)
useractcomp = pd.get_dummies(useractcomp, columns=['SecretCode', 'Country_ID', 'remian_month_end_comp'])

In [None]:
# Per-user statistics of the day of month on which competition pages were visited.
comp_day_stats = ['std', 'sum', 'mean', 'count', 'var', 'max']
usercompday = (
    useractcomp.groupby(['User_ID'])['datetime Day_of_month']
    .agg(comp_day_stats)
    .reset_index()
)
# Distance from the last active day to day 31, in days and in whole weeks.
usercompday['remian_day_next'] = 31 - usercompday['max']
usercompday['week_max'] = usercompday['max'] // 7
usercompday['remian_week_next'] = usercompday['remian_day_next'] // 7
# Suffix every column except the key with 'compact'.
cols = [col if col == 'User_ID' else col + 'compact' for col in usercompday.columns]
usercompday.columns = cols
usercompday

In [None]:
# One row per user: summed event columns joined with the day statistics.
comp_event_totals = useractcomp.groupby('User_ID').sum().reset_index()
useractcomp = comp_event_totals.merge(usercompday, how='left')
# Suffix every column except the key with 'compact'.
cols = [col if col == 'User_ID' else col + 'compact' for col in useractcomp.columns]
useractcomp.columns = cols
# Summed year/month values are dropped.
useractcomp = useractcomp.drop(columns=['datetime Yearcompact', 'datetime Monthcompact'])
useractcomp

In [None]:
# Per-user statistics of the day of month over all sign-up-month activity.
activity_day_stats = ['std', 'sum', 'mean', 'count', 'var', 'max']
actdf = (
    useract.groupby(['User_ID'])['datetime Day_of_month']
    .agg(activity_day_stats)
    .reset_index()
)

In [None]:
# Per-user, per-activity event counts for the sign-up month.
colsact = new_test.drop('User_ID', axis=1).columns
df_count = Users[['User_ID']]
for col in colsact:
    # `colsact` holds the normalised activity names (ids stripped, spaces turned
    # into underscores), so the rows are selected on `main_activities`. The raw
    # `Title` still carries the ids/spaces and therefore never equals names such
    # as 'comp_ID'.
    events = useract[useract['main_activities'] == col]
    # NOTE(review): summing the per-day sizes gives the user's total number of
    # events, not the number of active days the column name suggests - confirm
    # which one is wanted.
    per_user = (
        events.groupby(['User_ID', 'datetime Day_of_month']).size().reset_index()
        .groupby('User_ID')[0].sum().reset_index()
        .rename(columns={0: f'count_day_{col}'})
    )
    df_count = df_count.merge(per_user, how='left')

# Row total over the count columns only; the string User_ID column is excluded
# explicitly instead of relying on pandas silently skipping it.
df_count['day_act_sum'] = df_count.drop(columns='User_ID').sum(axis=1)
actdf.head(1)

In [None]:
# Number of distinct active days per user (the count ends up in column 0).
user_day_pairs = useract.groupby(['User_ID', 'datetime Day_of_month']).size().reset_index()
no_day_act = user_day_pairs.groupby('User_ID').size().reset_index()

### Discussion 

In [None]:
# Rename the discussion date columns so they do not clash with the sign-up date
# columns brought in from Users.
disc = Discussion.rename(
    columns={'Created At Month': 'disc_month', 'Created At Day_of_month': 'day_of_disc'}
)
signup_dates = Users[['User_ID', 'Created At Month', 'Created At Day_of_month']]
disc = disc.merge(signup_dates, how='left', on=['User_ID'])
# Users who posted in the month right after signing up.
posted_following_month = disc['disc_month'] == disc['Created At Month'] + 1
discid = disc.loc[posted_following_month, 'User_ID'].drop_duplicates()
# Only discussions created in the sign-up month are used for features.
posted_signup_month = disc['disc_month'] == disc['Created At Month']
disc = disc[posted_signup_month].drop(columns=['disc_month'])

In [None]:
# Show the first row of Competition.
Competition.head(1)

In [None]:
# Attach the attributes of the competition each discussion belongs to.
competition_lookup = Competition[['Comp_ID', 'End Time Month', 'SecretCode', 'Country_ID']]
disc = disc.merge(competition_lookup, left_on='Competition ID', right_on='Comp_ID', how='left')
disc.head()

In [None]:
# 1 when the discussed competition does not end in the user's sign-up month.
disc['discused_comp_con'] = ((disc['End Time Month'] - disc['Created At Month']) != 0).astype(int)
disc = pd.get_dummies(disc, columns=['Personal', 'Theme', 'SecretCode', 'Country_ID', 'discused_comp_con'])
# Per-user spread and count of the days on which discussions were created.
disc_df = disc.groupby('User_ID')['day_of_disc'].agg(['std', 'var', 'count']).reset_index()
# Number of discussion rows per (month, competition) and per (month, discussion).
disc = disc.merge(
    disc.groupby(['Created At Month', 'Competition ID'])['User_ID'].size().reset_index().rename(columns={'User_ID': 'discussed_per_com'}),
    how='left', on=['Created At Month', 'Competition ID'],
)
disc = disc.merge(
    disc.groupby(['Created At Month', 'Disc_ID'])['User_ID'].size().reset_index().rename(columns={'User_ID': 'discussed_per_disc'}),
    how='left', on=['Created At Month', 'Disc_ID'],
)
# Collapse to one row per user and attach the day statistics exactly once.
# The aggregate/merge pair used to be executed twice and each result was cut
# with .head(), which left only five users in the table and duplicated the
# statistic columns with _x/_y suffixes.
disc = disc.groupby('User_ID').sum().reset_index()
disc = disc.merge(disc_df, how='right', on='User_ID')
disc = disc.drop(['day_of_disc', 'Created At Month', 'Created At Day_of_month', 'Created At Year', 'End Time Month'], axis=1)

In [None]:
# New column names for disc: everything except the key gets a 'disc_' prefix.
col = [i if i == 'User_ID' else 'disc_' + i for i in disc.columns]
        

In [None]:
# Apply the prefixed names built in the previous cell.
disc.columns = pd.Index(col)
disc.head()

### learning page

In [None]:
# Tag every activity row with the first token of its title and strip the
# 'blog_' prefix from the title, then keep the blog rows only.
blog = UserActivity.assign(
    type=UserActivity['Title'].str.split('_', expand=True)[0],
    Title=UserActivity['Title'].str.replace('blog_', ''),
)
blog = blog[blog['type'] == 'blog']
# Join the blog metadata and the user's sign-up month.
blogs = blog.merge(Blogs, left_on='Title', right_on='blog_ID', how='left')
blogs = blogs.merge(Users[['User_ID', 'Created At Month']], how='left')
# Only blog views made during the sign-up month are summarised.
viewed_in_signup_month = blogs['datetime Month'] == blogs['Created At Month']
blogs = blogs[viewed_in_signup_month]
blog_day_stats = ['sum', 'mean', 'std', 'var', 'count', 'max']
blogam = blogs.groupby('User_ID')['datetime Day_of_month'].agg(blog_day_stats).reset_index()

In [None]:
# Prefix every blog statistic (everything except the key) with 'blog_'.
cols = [col if col == 'User_ID' else 'blog_' + col for col in blogam.columns]
blogam.columns = cols
blogam.head()

# Replace the in-memory infocomp / cop_feat frames with the contents of the CSV files.
# NOTE(review): the visible code writes 'cop_feat2.csv' and never writes
# 'infocomp.csv' - confirm that both files exist and are up to date.
infocomp = pd.read_csv('infocomp.csv')
cop_feat = pd.read_csv('cop_feat.csv')

In [None]:
# Train-vs-test classifier: a CatBoost model is fitted to tell training rows
# (source=0) from test rows (source=1) using the activity features, and its
# out-of-fold probabilities are kept in `oof`.
# Note that `train_df` / `test_df` are rebound here to the per-user frames
# built above (new_train / new_test).
train_df = new_train.reset_index(drop=True)
test_df = new_test.reset_index(drop=True)
# Feature columns: everything except the identifier and the label.
main_cols = train_df.columns.difference([
    'User_ID','Active', 
    # 'Signed_Up_month_4`', 'badge_OCZE_month_4', 'create_alias_month_4',
    # 'Confirmed_Email_month_4', 'Viewed_All_Discussions_month_4', 'Viewed_All_Competitions_month_4',
    # 'Viewed_All_Learning_Pages_month_4', 'Updated_Profile_month_4', 'identify_month_4', 'comp_ID_month_4',
    # 'Joined_Competition_month_4', 'Downloaded_Competition_Datafile_month_4', 'Created At Month'
])
X = train_df[main_cols]
y = train_df['Active']
test = test_df[main_cols]
# Mark the origin of each row and carry the Active label along as column 'y'.
X['source']=0
X['y']=y
test1=test.copy()
test1['source']=1
# Stack train and test; the original row labels are kept (no ignore_index),
# so index labels repeat between the two parts.
df=pd.concat([X, test1])
# From here on `y` is the train/test origin flag, not the Active label.
y=df['source']
# NOTE(review): `f` is not used again in this cell.
f=X[X['y']<2]
X=df.drop(['y', 'source'], axis=1)
# Train a model to predict whether a row comes from the test set.
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

model = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.01124,
    auto_class_weights='Balanced',
    random_seed=0,
    
    # class_weight='balanced'
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)
# Out-of-fold probability of class 1 (test origin) for every stacked row.
oof = np.zeros(len(X))
predictions = []

for fold, (trn_idx, val_idx) in enumerate(skfold.split(X, y)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    
    # The same model object is refitted in every fold.
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=100,
        early_stopping_rounds=100,
    )

    oof[val_idx] = model.predict_proba(X_valid)[:, 1]
    # Probability of test origin for the test rows, one array per fold.
    predictions.append(model.predict_proba(test)[:, 1])

# Average the per-fold test probabilities.
predictions = np.mean(predictions, axis=0)
#print(f'Our oof f1 score is {f1_score(y, oof)}')

In [None]:
# Preview the stacked feature matrix used by the train-vs-test model.
X.head()

In [None]:
# Select the training users whose out-of-fold "test origin" probability exceeds 0.34.
# 't' is the origin flag (0 = train, 1 = test) and 'y' the out-of-fold probability.
X['t']=y
X['y']=oof
# Keep the training rows only.
X=X[X['t']==0]
# Both assignments align on the index labels of the remaining rows.
# NOTE(review): this relies on those labels matching the row labels of
# new_train / train_df - confirm.
X['User_ID']=new_train['User_ID']
X['Active']=train_df['Active']
# NOTE(review): 0.34 is a hard-coded threshold; its origin is not shown here.
X=X[X['y']>0.34]
# `y` is rebound to the Active label of the selected users.
y=X['Active']
# One-column frame with the ids of the selected training users.
train_users_id=X.drop(['y', 't'], axis=1)[['User_ID']]

In [None]:
# Number of training users kept by the selection above.
train_users_id.shape

### more feature engineering on activity

In [None]:
colsact=new_test.drop('User_ID', axis=1).columns
new_train['allact_sum']=new_train[colsact].sum(axis=1)
new_train['allact_mean']=new_train[colsact].mean(axis=1)
new_train['allact_median']=new_train[colsact].median(axis=1)
new_test['allact_sum']=new_test[colsact].sum(axis=1)
new_test['allact_mean']=new_test[colsact].mean(axis=1)
new_test['allact_median']=new_test[colsact].median(axis=1)
new_train['subact']=new_train['Created_Submission']*new_train['allact_sum']
new_test['subact']=new_test['Created_Submission']*new_test['allact_sum']
new_train=new_train.merge(user, how='left')
new_test=new_test.merge(user, how='left')
new_train.loc[new_train['Created At Day_of_month']<6, 'day_group']=1
new_train.loc[(new_train['Created At Day_of_month']>=6) & (new_train['Created At Day_of_month']<10), 'day_group']=2
new_train.loc[(new_train['Created At Day_of_month']>=10) &(new_train['Created At Day_of_month']<15), 'day_group']=3
new_train.loc[new_train['Created At Day_of_month']>14, 'day_group']=4
new_test.loc[new_test['Created At Day_of_month']<6, 'day_group']=1
new_test.loc[(new_test['Created At Day_of_month']>=6) & (new_test['Created At Day_of_month']<10), 'day_group']=2
new_test.loc[(new_test['Created At Day_of_month']>=10) &(new_test['Created At Day_of_month']<15), 'day_group']=3
new_test.loc[new_test['Created At Day_of_month']>14, 'day_group']=4
cl=KMeans(n_clusters=8)
cl.fit(pd.concat([new_train[colsact], new_test[colsact]]))
new_train['cluster_allact']=cl.predict(new_train[colsact])
new_test['cluster_allact']=cl.predict(new_test[colsact])
# Row-wise summary statistics over the three competition-related columns,
# computed identically for train and test.
col_selected = ['Joined_Competition', 'Downloaded_Competition_Datafile', 'Created_Submission']
for _frame in (new_train, new_test):
    _comp = _frame[col_selected]
    _frame['comp_sum'] = _comp.sum(axis=1)
    _frame['comp_mean'] = _comp.mean(axis=1)
    _frame['comp_median'] = _comp.median(axis=1)
    _frame['comp_q1'] = _comp.quantile(0.75, axis=1)
    _frame['comp_q2'] = _comp.quantile(0.85, axis=1)
    # Train and test must use the same quantile level; the test frame used 0.5
    # here, which made 'comp_q3' mean different things in the two frames.
    _frame['comp_q3'] = _comp.quantile(0.95, axis=1)

# Cluster on the same three columns, fitted on train + test together.
# KMeans is already imported in the notebook's import cell; random_state keeps
# the cluster ids stable across re-runs of this cell.
cl = KMeans(n_clusters=4, random_state=seed)
cl.fit(pd.concat([new_train[col_selected], new_test[col_selected]]))
new_train['cluster_allcomp'] = cl.predict(new_train[col_selected])
new_test['cluster_allcomp'] = cl.predict(new_test[col_selected])

# Row-wise summary statistics over the three discussion-related columns,
# computed identically for train and test.
cols_disc = ['Updated_Discussion', 'Updated_Comment', 'Viewed_Discussion']
for _frame in (new_train, new_test):
    _disc = _frame[cols_disc]
    _frame['disc_sum'] = _disc.sum(axis=1)
    _frame['disc_mean'] = _disc.mean(axis=1)
    _frame['disc_median'] = _disc.median(axis=1)
    _frame['disc_q1'] = _disc.quantile(0.25, axis=1)
    # NOTE(review): the 0.5 quantile duplicates 'disc_median'; kept so the
    # feature set is unchanged.
    _frame['disc_q2'] = _disc.quantile(0.5, axis=1)
    _frame['disc_q3'] = _disc.quantile(0.75, axis=1)
    _frame['disc_q4'] = _disc.quantile(0.85, axis=1)

# Cluster on the same three columns, fitted on train + test together.
# random_state keeps the cluster ids stable across re-runs of this cell.
cl = KMeans(n_clusters=4, random_state=seed)
cl.fit(pd.concat([new_train[cols_disc], new_test[cols_disc]]))
new_train['cluster_alldisc'] = cl.predict(new_train[cols_disc])
new_test['cluster_alldisc'] = cl.predict(new_test[cols_disc])


### Merge all feature tables into the train and test frames

In [None]:
# Left-join every feature table onto the train / test frames.
# The first group is joined on all shared column names (no `on=`); the second
# group is joined explicitly on 'User_ID'. The per-frame join order is kept
# because it determines the resulting column order.
_natural_join_tables = [cop_feat, infocomp, jobs]
_user_join_tables = [
    userdf, df_count, usercomps, summ_com, actdf, blogam, disc,
    dayhour, useractcomp, no_day_act, comps,
]

for _table in _natural_join_tables:
    new_train = new_train.merge(_table, how='left')
    new_test = new_test.merge(_table, how='left')
for _table in _user_join_tables:
    new_train = new_train.merge(_table, how='left', on='User_ID')
    new_test = new_test.merge(_table, how='left', on='User_ID')

# The final join produces the modelling frames under new names.
train = new_train.merge(dayhourminute, how='left', on='User_ID')
test = new_test.merge(dayhourminute, how='left', on='User_ID')

# `testid` receives only a subset of the feature tables.
for _table in (cop_feat, infocomp):
    testid = testid.merge(_table, how='left')
for _table in (userdf, usercomps, summ_com, disc, comps, user):
    testid = testid.merge(_table, how='left', on='User_ID')

In [None]:
# Keep untouched snapshots of the merged frames so later cells can be re-run
# without repeating the merges.
train_df, test_df, testid_df = (_frame.copy() for _frame in (train, test, testid))

In [None]:
# Uncomment to restore the merged frames from the snapshots taken in the previous cell:
#train=train_df
#test=test_df
#testid=testid_df

In [None]:
# Right join: keep one row set per entry of `train_users_id`, attaching the
# features from `train` where the User_ID matches.
train1 = pd.merge(train, train_users_id, how='right', on='User_ID')

# Keep only the `testid` users that are not already present in `test`.
_already_in_test = testid['User_ID'].isin(test['User_ID'])
testid = testid.loc[~_already_in_test]

In [None]:
# Remove every user that has at least one row with allact_sum == 0
# (all rows of such a user are dropped, not only the zero rows).
_zero_activity_ids = train1.loc[train1['allact_sum'] == 0, 'User_ID']
train1 = train1[~train1['User_ID'].isin(_zero_activity_ids)]

# Row-wise sum of three competition-related columns.
cols = ['Viewed_All_Competitions', 'Created_Submission', 'comp_ID']
train1['comp_sums'] = train1[cols].sum(axis=1)
test['comp_sums'] = test[cols].sum(axis=1)

In [None]:
# Restrict `train` to months 1-3 before the month column is removed.
train = train[train['Created At Month'].isin([1, 2, 3])]

# Drop the raw month / time columns from every modelling frame.
_time_cols = ['Created At Month', 'Created At time']
train1 = train1.drop(columns=_time_cols)
train = train.drop(columns=_time_cols)
test = test.drop(columns=_time_cols)
testid = testid.drop(columns=_time_cols)

In [None]:
# Class balance of the target in train1.
train1.loc[:, 'Active'].value_counts()

In [None]:
# Replace every missing value with 0 in all four modelling frames.
na = 0
train, train1, test, testid = (
    _frame.fillna(na) for _frame in (train, train1, test, testid)
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-typed column (except the User_ID key).
# One encoder per column is fitted on the union of that column's values across
# all frames, so a given category maps to the same integer in train, train1,
# test and testid. Fitting a fresh encoder on each frame separately would hand
# out codes by each frame's own sorted categories, making the encoded feature
# inconsistent between the data a model is trained on and the data it scores.
lst_cat = [feature for feature in train.select_dtypes('O').columns if feature != 'User_ID']
for i in lst_cat:
    # `testid` was built from fewer feature tables, so it may lack a column;
    # check explicitly instead of swallowing every exception.
    _frames_with_col = [
        _frame for _frame in (train, train1, test, testid) if i in _frame.columns
    ]
    le = LabelEncoder()
    le.fit(pd.concat(
        [_frame[i].astype(str) for _frame in _frames_with_col],
        ignore_index=True,
    ))
    for _frame in _frames_with_col:
        _frame[i] = le.transform(_frame[i].astype(str))


In [None]:
# Split by day of month.
#   train2: days 22-29 (rows with day >= 30 fall into neither train split)
#   train3: days < 22
#   test2 / test3: `testid` rows with day > 21 / day < 22
_train_day = train['Created At Day_of_month']
train2 = train.loc[_train_day.gt(21) & _train_day.lt(30)]
train3 = train.loc[_train_day.lt(22)]

_testid_day = testid['Created At Day_of_month']
test2 = testid.loc[_testid_day.gt(21)]
test3 = testid.loc[_testid_day.lt(22)]

# train1 keeps only days < 22.
train1 = train1.loc[train1['Created At Day_of_month'].lt(22)]

In [None]:
# Feature matrices (key and target removed) and targets for the three models.
_non_features = ['User_ID', 'Active']
X1, y1 = train1.drop(columns=_non_features), train1['Active']
X2, y2 = train2.drop(columns=_non_features), train2['Active']
X3, y3 = train3.drop(columns=_non_features), train3['Active']

# Keep the ids for the submission, then strip them from the test matrices.
test_id, test = test['User_ID'], test.drop(columns='User_ID')
test_id2, test2 = test2['User_ID'], test2.drop(columns='User_ID')
test_id3, test3 = test3['User_ID'], test3.drop(columns='User_ID')

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score, precision_recall_curve, average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
# Drop five one-hot feature columns from both matrices.
_dropped_features = ['FeatureB_2_0', 'FeatureF_2_0', 'FeatureH_3_0', 'FeatureH_2_0', 'FeatureA_3_0']
X1 = X1.drop(columns=_dropped_features)
test = test.drop(columns=_dropped_features)

# Cluster on the full feature set, fitted on train + test together so both
# share the same cluster ids. random_state keeps the 'clusterall' feature
# stable when this cell is re-run.
cl = KMeans(n_clusters=4, random_state=seed)
cl.fit(pd.concat([X1, test]))
X1['clusterall'] = cl.predict(X1)
test['clusterall'] = cl.predict(test)

In [None]:
# BalancedRandomForestClassifier, 5-fold stratified CV.
# NOTE(review): `oof` and `predictions1` produced here are reset by the
# logistic-regression stage that follows, and the 'rf' feature assignments at
# the end are commented out, so this stage does not feed any later result.
model1 = BalancedRandomForestClassifier(n_estimators=20)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
oof = np.zeros(len(X1))
predictions1 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X1, y1), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X1.iloc[trn_idx], X1.iloc[val_idx]
    y_train, y_valid = y1.iloc[trn_idx], y1.iloc[val_idx]

    model1.fit(X_train, y_train)

    # Positive-class probability for the held-out fold and for the test set.
    oof[val_idx] = model1.predict_proba(X_valid)[:, 1]
    predictions1.append(model1.predict_proba(test)[:, 1])
# Average the per-fold test predictions.
predictions1 = np.mean(predictions1, axis=0)
#X1['rf']=oof
#test['rf']=predictions1
# Class-weighted logistic regression, 5-fold stratified CV.
# Out-of-fold probabilities become the 'lg' feature for X1; the fold-averaged
# test probabilities become 'lg' for test.
model1 = LogisticRegression(class_weight='balanced')
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X1))
predictions1 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X1, y1), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X1.iloc[trn_idx], X1.iloc[val_idx]
    y_train, y_valid = y1.iloc[trn_idx], y1.iloc[val_idx]

    model1.fit(X_train, y_train)

    oof[val_idx] = model1.predict_proba(X_valid)[:, 1]
    predictions1.append(model1.predict_proba(test)[:, 1])

predictions1 = np.mean(predictions1, axis=0)
X1['lg'] = oof
test['lg'] = predictions1

# First CatBoost pass, 5-fold stratified CV with early stopping on each
# validation fold. Its out-of-fold probabilities become the 'cat' feature.
model1 = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.0100800800100051124,
    depth=7,
    random_seed=0,
    auto_class_weights='Balanced',
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X1))
predictions1 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X1, y1), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X1.iloc[trn_idx], X1.iloc[val_idx]
    y_train, y_valid = y1.iloc[trn_idx], y1.iloc[val_idx]

    model1.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=100,
        early_stopping_rounds=100,
    )

    oof[val_idx] = model1.predict_proba(X_valid)[:, 1]
    predictions1.append(model1.predict_proba(test)[:, 1])

predictions1 = np.mean(predictions1, axis=0)

# Stacking features: CatBoost probability and the plain (unweighted) mean of
# the logistic-regression and CatBoost probabilities.
X1['cat'] = oof
test['cat'] = predictions1
X1['weighted_avg'] = X1[['lg', 'cat']].mean(axis=1)
test['weighted_avg'] = test[['lg', 'cat']].mean(axis=1)

# Drop training rows where (label - out-of-fold probability) <= -0.85, i.e.
# rows labelled 0 that the first pass scored at 0.85 or higher.
_scored = X1.assign(y=y1)
_scored['diff'] = _scored['y'] - oof
_scored = _scored[_scored['diff'] > -0.85]
y1 = _scored['y']
X1 = _scored.drop(columns=['diff', 'y'])

# 

# Second CatBoost pass on the filtered rows plus the stacking features.
# `oof` holds hard class predictions here (used for the F1 report), while
# `predictions1` holds fold-averaged test probabilities for the submission.
model1 = CatBoostClassifier(
    n_estimators=2000,
    learning_rate=0.0010050000800100051124,
    depth=9,
    random_seed=0,
    auto_class_weights='Balanced',
    #class_weights='balanced',
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X1))
predictions1 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X1, y1)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X1.iloc[trn_idx], y1.iloc[trn_idx]
    X_valid, y_test = X1.iloc[val_idx], y1.iloc[val_idx]

    model1.fit(
        X_train, y_train,
        eval_set=(X_valid, y_test),
        verbose=100,
        early_stopping_rounds=100,
    )

    # Predict the validation fold once and reuse the result for both the
    # out-of-fold vector and the per-fold reports.
    y_predicted = model1.predict(X_valid)
    oof[val_idx] = y_predicted
    print('Classifier report:\n',classification_report(y_test,y_predicted))
    print('Confusion matrix:\n',confusion_matrix(y_test,y_predicted))
    predictions1.append(model1.predict_proba(test)[:, 1])

predictions1 = np.mean(predictions1, axis=0)
# NOTE: this score is computed on the rows that survived the filter above.
print(f'Our oof f1 score is {f1_score(y1, oof)}')

In [None]:
# Align X2 to the columns (and column order) available in test2.
X2 = X2[test2.columns]

# Drop five one-hot feature columns from both matrices.
_dropped_features = ['FeatureB_2_0', 'FeatureF_2_0', 'FeatureH_3_0', 'FeatureH_2_0', 'FeatureA_3_0']
X2 = X2.drop(columns=_dropped_features)
test2 = test2.drop(columns=_dropped_features)

# Cluster on the full feature set, fitted on train + test together.
# random_state keeps the 'clusterall' feature stable when this cell is re-run.
cl = KMeans(n_clusters=4, random_state=seed)
cl.fit(pd.concat([X2, test2]))
X2['clusterall'] = cl.predict(X2)
test2['clusterall'] = cl.predict(test2)

In [None]:
# Train a model to predict the active users.
# BalancedRandomForestClassifier, 5-fold stratified CV.
# NOTE(review): `oof` and `predictions2` produced here are reset by the
# logistic-regression stage that follows, and the 'rf' feature assignments at
# the end are commented out, so this stage does not feed any later result.
model2 = BalancedRandomForestClassifier(n_estimators=20)
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
oof = np.zeros(len(X2))
predictions2 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X2, y2), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X2.iloc[trn_idx], X2.iloc[val_idx]
    y_train, y_valid = y2.iloc[trn_idx], y2.iloc[val_idx]

    model2.fit(X_train, y_train)

    # Positive-class probability for the held-out fold and for the test set.
    oof[val_idx] = model2.predict_proba(X_valid)[:, 1]
    predictions2.append(model2.predict_proba(test2)[:, 1])
# Average the per-fold test predictions.
predictions2 = np.mean(predictions2, axis=0)
#X2['rf']=oof
#test2['rf']=predictions2
# Class-weighted logistic regression, 5-fold stratified CV.
# Out-of-fold probabilities become the 'lg' feature for X2; the fold-averaged
# test probabilities become 'lg' for test2.
model2 = LogisticRegression(class_weight='balanced')

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X2))
predictions2 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X2, y2)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X2.iloc[trn_idx], y2.iloc[trn_idx]
    X_valid, y_test = X2.iloc[val_idx], y2.iloc[val_idx]

    model2.fit(X_train, y_train)

    # Store probabilities rather than hard labels so 'lg' has the same meaning
    # in X2 as in test2 (and matches the equivalent stage for model1).
    oof[val_idx] = model2.predict_proba(X_valid)[:, 1]
    # Collect this fold's test predictions. Without this append the mean below
    # is taken over an empty list and 'lg' is NaN for every test2 row.
    predictions2.append(model2.predict_proba(test2)[:, 1])
predictions2 = np.mean(predictions2, axis=0)
X2['lg'] = oof
test2['lg'] = predictions2
# First CatBoost pass, 5-fold stratified CV with early stopping on each
# validation fold. Its out-of-fold probabilities become the 'cat' feature.
model2 = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.0100800100051124,
    depth=7,
    random_seed=0,
    auto_class_weights='Balanced',
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X2))
predictions2 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X2, y2), start=1):
    print(f'Fold {fold}')
    X_train, X_valid = X2.iloc[trn_idx], X2.iloc[val_idx]
    y_train, y_valid = y2.iloc[trn_idx], y2.iloc[val_idx]

    model2.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=100,
        early_stopping_rounds=100,
    )

    oof[val_idx] = model2.predict_proba(X_valid)[:, 1]
    predictions2.append(model2.predict_proba(test2)[:, 1])
predictions2 = np.mean(predictions2, axis=0)

# Stacking features: CatBoost probability and the plain (unweighted) mean of
# the 'lg' and 'cat' columns.
X2['cat'] = oof
test2['cat'] = predictions2
X2['weighted_avg'] = X2[['lg', 'cat']].mean(axis=1)
test2['weighted_avg'] = test2[['lg', 'cat']].mean(axis=1)
# Train CatBoost again on the filtered rows.
# Drop training rows where (label - first-pass out-of-fold probability)
# is <= -0.65, i.e. rows labelled 0 that the first pass scored at 0.65 or higher.
X2['y']=y2
X2['diff']=X2['y']-oof
X2=X2[X2['diff']>-0.65]
y2=X2['y']
X2=X2.drop(['diff', 'y'], axis=1)

# `oof` holds hard class predictions here (used for the F1 report), while
# `predictions2` holds fold-averaged test probabilities for the submission.
model2 = CatBoostClassifier(
    n_estimators=2000,
    learning_rate=0.00100800100051124,
    depth=9,
    random_seed=0,
    auto_class_weights='Balanced',
    #class_weights='balanced',
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(X2))
predictions2 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X2, y2)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X2.iloc[trn_idx], y2.iloc[trn_idx]
    X_valid, y_test = X2.iloc[val_idx], y2.iloc[val_idx]

    model2.fit(
        X_train, y_train,
        eval_set=(X_valid, y_test),
        verbose=100,
        early_stopping_rounds=100,
    )

    # Predict the validation fold once and reuse the result for both the
    # out-of-fold vector and the per-fold reports.
    y_predicted = model2.predict(X_valid)
    oof[val_idx] = y_predicted
    print('Classifier report:\n',classification_report(y_test,y_predicted))
    print('Confusion matrix:\n',confusion_matrix(y_test,y_predicted))
    predictions2.append(model2.predict_proba(test2)[:, 1])

predictions2 = np.mean(predictions2, axis=0)
# NOTE: this score is computed on the rows that survived the filter above.
print(f'Our oof f1 score is {f1_score(y2, oof)}')

In [None]:
# Align X3 to the columns (and column order) available in test3, then drop the
# same five one-hot feature columns from both matrices.
_dropped_features = ['FeatureB_2_0', 'FeatureF_2_0', 'FeatureH_3_0', 'FeatureH_2_0', 'FeatureA_3_0']
X3 = X3[test3.columns].drop(columns=_dropped_features)
test3 = test3.drop(columns=_dropped_features)


In [None]:
# Train a model to predict the active users (days < 22 split).
# Single CatBoost pass without class weighting; `oof` holds hard class
# predictions, `predictions3` the fold-averaged test probabilities.
model3 = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.0100100051124,
    depth=7,
    random_seed=0,
    #auto_class_weights='Balanced',
    #class_weights='balanced',
)

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
oof = np.zeros(len(X3))
predictions3 = []
for fold, (trn_idx, val_idx) in enumerate(skfold.split(X3, y3)):
    print(f'Fold {fold + 1}')
    X_train, y_train = X3.iloc[trn_idx], y3.iloc[trn_idx]
    X_valid, y_test = X3.iloc[val_idx], y3.iloc[val_idx]

    model3.fit(
        X_train, y_train,
        eval_set=(X_valid, y_test),
        verbose=100,
        early_stopping_rounds=100,
    )

    # Predict the validation fold once and reuse the result for both the
    # out-of-fold vector and the per-fold reports.
    y_predicted = model3.predict(X_valid)
    oof[val_idx] = y_predicted
    print('Classifier report:\n',classification_report(y_test,y_predicted))
    print('Confusion matrix:\n',confusion_matrix(y_test,y_predicted))
    predictions3.append(model3.predict_proba(test3)[:, 1])

predictions3 = np.mean(predictions3, axis=0)
print(f'Our oof f1 score is {f1_score(y3, oof)}')

In [None]:
# Plot the 60 largest feature importances of model1.
# model1 is re-fitted on every fold, so these values come from the last fold's fit.

import matplotlib.pyplot as plt
import seaborn as sns

feat_importances = pd.Series(data=model1.feature_importances_, index=X1.columns)
fig, ax = plt.subplots(figsize=(20, 25))
feat_importances.nlargest(60).plot(kind='barh', ax=ax)
plt.show()

In [None]:
# Plot the 60 largest feature importances of model2.
# model2 is re-fitted on every fold, so these values come from the last fold's fit.

import matplotlib.pyplot as plt
import seaborn as sns

feat_importances = pd.Series(data=model2.feature_importances_, index=X2.columns)
fig, ax = plt.subplots(figsize=(20, 25))
feat_importances.nlargest(60).plot(kind='barh', ax=ax)
plt.show()

In [None]:
# Plot the 60 largest feature importances of model3.
# model3 is re-fitted on every fold, so these values come from the last fold's fit.

import matplotlib.pyplot as plt
import seaborn as sns

feat_importances = pd.Series(data=model3.feature_importances_, index=X3.columns)
fig, ax = plt.subplots(figsize=(20, 25))
feat_importances.nlargest(60).plot(kind='barh', ax=ax)
plt.show()

In [None]:
# One submission frame per model: the user ids paired with that model's
# fold-averaged probabilities.
sub1 = pd.DataFrame({'User_ID_Next_month_Activity': test_id}).assign(Active=predictions1)
sub2 = pd.DataFrame({'User_ID_Next_month_Activity': test_id2}).assign(Active=predictions2)
sub3 = pd.DataFrame({'User_ID_Next_month_Activity': test_id3}).assign(Active=predictions3)

In [None]:
# Distribution of the predicted probabilities for model 1.
sub1['Active'].describe()

In [None]:
# Distribution of the predicted probabilities for model 2.
sub2['Active'].describe()

In [None]:
# Turn probabilities into 0/1 labels with a per-model threshold.
sub1['Active'] = sub1['Active'].gt(0.5).astype('int64')
sub2['Active'] = sub2['Active'].gt(0.65).astype('int64')
# NOTE(review): a probability never exceeds 1, so every sub3 row is labelled 0.
# This looks deliberate (treat this group as inactive) - confirm.
sub3['Active'] = sub3['Active'].gt(1).astype('int64')

display(sub1['Active'].value_counts())
sub2['Active'].value_counts()

In [None]:
# Users from test_id2 that survive inner joins with both CompetitionPartipation
# and Discussion (joined on the shared column names, presumably 'User_ID').
_with_comp = test_id2.to_frame().merge(CompetitionPartipation)
_comp_users = _with_comp.groupby('User_ID').size().reset_index()
_with_disc = _comp_users.merge(Discussion)
specail = _with_disc.groupby('User_ID').size().reset_index()

# Show the sub2 predictions for those users.
sub2[sub2['User_ID_Next_month_Activity'].isin(specail['User_ID'])]

In [None]:
# Stack the three per-model submission frames row-wise.
sub = pd.concat([sub1, sub2, sub3], axis=0)

In [None]:
# Append the month suffix to every id.
# NOTE: re-running this cell appends the suffix again.
_id_col = 'User_ID_Next_month_Activity'
sub[_id_col] = sub[_id_col] + '_Month_5'

In [None]:
# Class balance of the final submission labels.
sub['Active'].value_counts()

In [None]:
# Write the submission file without the DataFrame index.
sub.to_csv(path_or_buf='submission.csv', index=False)