In [1]:
# Mount Google Drive inside the Colab VM; every data, model and output path
# used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Single place to configure the competition folder on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/B.Tech/kaggle/predicting-red-hat-business-value'

# Input tables: the people table and the test-set activity table.
people_path = DATA_DIR + '/people.csv'
act_test_path = DATA_DIR + '/act_test.csv'

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime
import joblib

In [4]:
# Read both input tables: the people table first, then the test activities.
df_people, df_test = (
    pd.read_csv(csv_path) for csv_path in (people_path, act_test_path)
)

In [5]:
# Missing-value summary of the raw test activities, one row per column:
# the absolute null count and its share of all rows in percent.
df_null = df_test.isnull().sum().to_frame('Null count')
df_null['Null %'] = df_null['Null count'].div(df_test.shape[0]).mul(100)
df_null

Unnamed: 0,Null count,Null %
people_id,0,0.0
activity_id,0,0.0
date,0,0.0
activity_category,0,0.0
char_1,458595,91.960488
char_2,458595,91.960488
char_3,458595,91.960488
char_4,458595,91.960488
char_5,458595,91.960488
char_6,458595,91.960488


In [6]:
# Remove the activity-level char_1..char_9 columns (the null summary above
# reports roughly 92% missing values for the ones it lists); char_10 is the
# only activity attribute that is kept.
df_test1 = df_test.drop(
    columns=[
        'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6',
        'char_7', 'char_8', 'char_9',
    ]
)
df_test1

Unnamed: 0,people_id,activity_id,date,activity_category,char_10
0,ppl_100004,act1_249281,2022-07-20,type 1,
1,ppl_100004,act2_230855,2022-07-20,type 5,type 682
2,ppl_10001,act1_240724,2022-10-14,type 1,
3,ppl_10001,act1_83552,2022-11-27,type 1,
4,ppl_10001,act2_1043301,2022-10-15,type 5,type 3015
...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,type 2,type 1
498683,ppl_99997,act2_4404220,2022-11-12,type 2,type 1
498684,ppl_99997,act2_448830,2022-08-02,type 2,type 1
498685,ppl_99997,act2_450133,2022-08-02,type 2,type 1


In [7]:
# Missing values per column that remain after dropping the sparse char columns.
df_test1.isnull().sum()

people_id                0
activity_id              0
date                     0
activity_category        0
char_10              40092
dtype: int64

In [8]:
def convert_str_to_num(x):
    """Return the integer found in the second space-separated token of ``x``.

    Used for labels such as ``'type 682'`` or ``'group 22593'``: the text
    before the first space is discarded and the token after it is parsed
    with ``int``.
    """
    return int(x.split(' ')[1])

In [9]:
# Activity-table columns whose 'type N' labels are replaced by the integer N.
act_char = ['activity_category']
for column in act_char:
    df_test1[column] = df_test1[column].map(convert_str_to_num)
df_test1

Unnamed: 0,people_id,activity_id,date,activity_category,char_10
0,ppl_100004,act1_249281,2022-07-20,1,
1,ppl_100004,act2_230855,2022-07-20,5,type 682
2,ppl_10001,act1_240724,2022-10-14,1,
3,ppl_10001,act1_83552,2022-11-27,1,
4,ppl_10001,act2_1043301,2022-10-15,5,type 3015
...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,2,type 1
498683,ppl_99997,act2_4404220,2022-11-12,2,type 1
498684,ppl_99997,act2_448830,2022-08-02,2,type 1
498685,ppl_99997,act2_450133,2022-08-02,2,type 1


In [10]:
# The activity table's char_10 would clash with the people table's char_10
# after the merge below, so give it a distinct name first.
df_test1 = df_test1.rename(columns={"char_10": "activity_char"})
df_test1

Unnamed: 0,people_id,activity_id,date,activity_category,activity_char
0,ppl_100004,act1_249281,2022-07-20,1,
1,ppl_100004,act2_230855,2022-07-20,5,type 682
2,ppl_10001,act1_240724,2022-10-14,1,
3,ppl_10001,act1_83552,2022-11-27,1,
4,ppl_10001,act2_1043301,2022-10-15,5,type 3015
...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,2,type 1
498683,ppl_99997,act2_4404220,2022-11-12,2,type 1
498684,ppl_99997,act2_448830,2022-08-02,2,type 1
498685,ppl_99997,act2_450133,2022-08-02,2,type 1


In [11]:
# Attach each person's attributes to their activities. Both tables have a
# `date` column, so pandas applies its default suffixes: date_x comes from the
# activity table and date_y from the people table.
# validate='many_to_one' makes pandas raise MergeError if people_id is not
# unique in df_people; without it a duplicated person would silently duplicate
# activity rows and the submission would contain repeated activity_ids.
df_test2 = pd.merge(df_test1, df_people, how = 'left', on = ['people_id'],
                    validate = 'many_to_one')
df_test2

Unnamed: 0,people_id,activity_id,date_x,activity_category,activity_char,char_1,group_1,char_2,date_y,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100004,act1_249281,2022-07-20,1,,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
1,ppl_100004,act2_230855,2022-07-20,5,type 682,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76
2,ppl_10001,act1_240724,2022-10-14,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90
3,ppl_10001,act1_83552,2022-11-27,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90
4,ppl_10001,act2_1043301,2022-10-15,5,type 3015,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36
498683,ppl_99997,act2_4404220,2022-11-12,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36
498684,ppl_99997,act2_448830,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36
498685,ppl_99997,act2_450133,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36


In [12]:
# Parse the activity date (date_x) and the people date (date_y) into datetimes.
for date_column in ('date_x', 'date_y'):
    df_test2[date_column] = pd.to_datetime(df_test2[date_column])

# Quick check: show the element type of the first value in each column.
for date_column in ('date_x', 'date_y'):
    print(type(df_test2[date_column][0]))

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [13]:
# Whole days between the activity date (date_x) and the person's date (date_y).
# The vectorised `.dt.days` accessor yields the same integers as calling
# `.days` on each Timedelta in a Python-level lambda, without the per-row
# overhead on a frame of this size.
df_test2['days_difference'] = (df_test2['date_x'] - df_test2['date_y']).dt.days
df_test2

Unnamed: 0,people_id,activity_id,date_x,activity_category,activity_char,char_1,group_1,char_2,date_y,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,ppl_100004,act1_249281,2022-07-20,1,,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76,0
1,ppl_100004,act2_230855,2022-07-20,5,type 682,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,76,0
2,ppl_10001,act1_240724,2022-10-14,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90,0
3,ppl_10001,act1_83552,2022-11-27,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90,44
4,ppl_10001,act2_1043301,2022-10-15,5,type 3015,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36,406
498683,ppl_99997,act2_4404220,2022-11-12,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36,245
498684,ppl_99997,act2_448830,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36,143
498685,ppl_99997,act2_450133,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,36,143


In [14]:
# Confirm that days_difference holds integers (first element shown).
type(df_test2['days_difference'][0])

numpy.int64

In [15]:
# Column dtypes after the merge: the people char_10+ flags are bool, most other
# char columns are still object (string labels).
df_test2.dtypes

people_id                    object
activity_id                  object
date_x               datetime64[ns]
activity_category             int64
activity_char                object
char_1                       object
group_1                      object
char_2                       object
date_y               datetime64[ns]
char_3                       object
char_4                       object
char_5                       object
char_6                       object
char_7                       object
char_8                       object
char_9                       object
char_10                        bool
char_11                        bool
char_12                        bool
char_13                        bool
char_14                        bool
char_15                        bool
char_16                        bool
char_17                        bool
char_18                        bool
char_19                        bool
char_20                        bool
char_21                     

In [16]:
# Build df_test3 as a copy of df_test2 in which every boolean column is cast
# to int (True -> 1, False -> 0); all other columns are carried over as they are.
df_test3 = df_test2.astype(
    {column: int for column in df_test2.any(bool_only = True).index}
)
df_test3

Unnamed: 0,people_id,activity_id,date_x,activity_category,activity_char,char_1,group_1,char_2,date_y,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,ppl_100004,act1_249281,2022-07-20,1,,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,ppl_100004,act2_230855,2022-07-20,5,type 682,type 2,group 22593,type 3,2022-07-20,type 40,type 25,type 9,type 4,type 16,type 2,type 2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
2,ppl_10001,act1_240724,2022-10-14,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
3,ppl_10001,act1_83552,2022-11-27,1,,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
4,ppl_10001,act2_1043301,2022-10-15,5,type 3015,type 2,group 25417,type 3,2022-10-14,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2023-04-22,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
498683,ppl_99997,act2_4404220,2022-11-12,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
498684,ppl_99997,act2_448830,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
498685,ppl_99997,act2_450133,2022-08-02,2,type 1,type 2,group 17304,type 2,2022-03-12,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


In [17]:
# The raw dates are no longer needed once days_difference has been derived.
df_test4 = df_test3.drop(columns=['date_x', 'date_y'])
df_test4

Unnamed: 0,people_id,activity_id,activity_category,activity_char,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,ppl_100004,act1_249281,1,,type 2,group 22593,type 3,type 40,type 25,type 9,type 4,type 16,type 2,type 2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,ppl_100004,act2_230855,5,type 682,type 2,group 22593,type 3,type 40,type 25,type 9,type 4,type 16,type 2,type 2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
2,ppl_10001,act1_240724,1,,type 2,group 25417,type 3,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
3,ppl_10001,act1_83552,1,,type 2,group 25417,type 3,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
4,ppl_10001,act2_1043301,5,type 3015,type 2,group 25417,type 3,type 6,type 6,type 4,type 1,type 1,type 2,type 2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2,type 1,type 2,group 17304,type 2,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
498683,ppl_99997,act2_4404220,2,type 1,type 2,group 17304,type 2,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
498684,ppl_99997,act2_448830,2,type 1,type 2,group 17304,type 2,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
498685,ppl_99997,act2_450133,2,type 1,type 2,group 17304,type 2,type 40,type 25,type 9,type 3,type 8,type 6,type 6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


In [18]:
# People-table columns holding 'type N' / 'group N' labels; each is replaced
# by the integer N.
chars = [
    'char_1', 'char_2', 'char_3', 'char_4', 'char_5',
    'char_6', 'char_7', 'char_8', 'char_9', 'group_1',
]
for column in chars:
    df_test4[column] = df_test4[column].map(convert_str_to_num)
df_test4

Unnamed: 0,people_id,activity_id,activity_category,activity_char,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,ppl_100004,act1_249281,1,,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,ppl_100004,act2_230855,5,type 682,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
2,ppl_10001,act1_240724,1,,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
3,ppl_10001,act1_83552,1,,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
4,ppl_10001,act2_1043301,5,type 3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,ppl_99997,act2_4367092,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
498683,ppl_99997,act2_4404220,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
498684,ppl_99997,act2_448830,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
498685,ppl_99997,act2_450133,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


In [19]:
def convert_str_to_num_(x):
    """Return the numeric part of an underscore-separated identifier as an int.

    The second ``'_'``-separated token of ``x`` (for example the ``100004`` in
    ``'ppl_100004'``) is parsed with ``float`` first and then truncated with
    ``int``, so tokens written in float or exponent notation are accepted too.
    """
    return int(float(x.split('_')[1]))

# Replace 'ppl_<n>' identifiers with their numeric part.
df_test4['people_id'] = df_test4['people_id'].map(convert_str_to_num_)
df_test4

Unnamed: 0,people_id,activity_id,activity_category,activity_char,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,100004,act1_249281,1,,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,100004,act2_230855,5,type 682,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
2,10001,act1_240724,1,,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
3,10001,act1_83552,1,,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
4,10001,act2_1043301,5,type 3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498682,99997,act2_4367092,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
498683,99997,act2_4404220,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
498684,99997,act2_448830,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
498685,99997,act2_450133,2,type 1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


In [20]:
# After the conversions only activity_id and activity_char are still object
# columns; they are handled in the null / non-null split below.
df_test4.dtypes

people_id             int64
activity_id          object
activity_category     int64
activity_char        object
char_1                int64
group_1               int64
char_2                int64
char_3                int64
char_4                int64
char_5                int64
char_6                int64
char_7                int64
char_8                int64
char_9                int64
char_10               int64
char_11               int64
char_12               int64
char_13               int64
char_14               int64
char_15               int64
char_16               int64
char_17               int64
char_18               int64
char_19               int64
char_20               int64
char_21               int64
char_22               int64
char_23               int64
char_24               int64
char_25               int64
char_26               int64
char_27               int64
char_28               int64
char_29               int64
char_30               int64
char_31             

In [21]:
# Rows that contain at least one missing value go to the "null" model.
is_NaN = df_test4.isnull()
row_has_NaN = is_NaN.any(axis=1)
null_rows = df_test4.loc[row_has_NaN]

# Remember the original row positions so predictions can be re-ordered later.
null_index = np.array(null_rows.index)

# Renumber the rows from 0 and drop activity_char from this feature frame.
df_test5_null = null_rows.reset_index(drop=True).drop(columns=['activity_char'])

# Keep the original string ids for the submission, then make the feature numeric.
activity_id_null = df_test5_null['activity_id']
df_test5_null['activity_id'] = activity_id_null.apply(convert_str_to_num_)
df_test5_null

Unnamed: 0,people_id,activity_id,activity_category,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,100004,249281,1,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,10001,240724,1,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
2,10001,83552,1,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
3,100010,218751,1,2,17304,2,8,7,8,1,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
4,100010,383524,1,2,17304,2,8,7,8,1,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40087,99881,250479,1,2,17304,2,26,6,8,3,9,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
40088,99899,412734,1,2,47052,3,15,6,6,5,15,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72,0
40089,99934,85735,1,2,8831,3,40,25,9,1,1,4,4,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,53,86
40090,99956,352909,1,2,20306,3,4,8,5,5,15,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90,1


In [22]:
# Everything that was not routed to the null frame goes to the "non-null" model.
non_null_rows = df_test4.drop(index=null_index)

# Remember the original row positions so predictions can be re-ordered later.
non_null_index = np.array(non_null_rows.index)
df_test5_non_null = non_null_rows.reset_index(drop=True)

# activity_char is present for these rows, so its 'type N' label can be parsed.
df_test5_non_null['activity_char'] = df_test5_non_null['activity_char'].apply(convert_str_to_num)

# Keep the original string ids for the submission, then make the feature numeric.
activity_id_non_null = df_test5_non_null['activity_id']
df_test5_non_null['activity_id'] = activity_id_non_null.apply(convert_str_to_num_)
df_test5_non_null

Unnamed: 0,people_id,activity_id,activity_category,activity_char,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,100004,230855,5,682,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,10001,1043301,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
2,10001,112890,5,4987,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
3,10001,1169930,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
4,10001,1924448,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458590,99997,4367092,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
458591,99997,4404220,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
458592,99997,448830,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
458593,99997,450133,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


## Predictions for rows without missing values (`activity_char` present)

In [23]:
# Feature frame for the non-null model. This is an alias of df_test5_non_null,
# not a copy.
x_test = df_test5_non_null
x_test

Unnamed: 0,people_id,activity_id,activity_category,activity_char,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,100004,230855,5,682,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,10001,1043301,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
2,10001,112890,5,4987,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
3,10001,1169930,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
4,10001,1924448,5,3015,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458590,99997,4367092,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,406
458591,99997,4404220,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,245
458592,99997,448830,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143
458593,99997,450133,2,1,2,17304,2,40,25,9,3,8,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,143


In [24]:
# Load the saved model that scores the rows which still have activity_char.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
model_non_null = joblib.load("/content/drive/MyDrive/B.Tech/kaggle/predicting-red-hat-business-value/model_non_null.joblib")

In [25]:
# Predict an outcome for every row of the non-null feature frame.
# NOTE(review): presumably the model was fitted on these same columns in this
# order - confirm against the training notebook.
y_pred = model_non_null.predict(x_test)
y_pred

array([1, 1, 1, ..., 0, 0, 0])

In [26]:
# Pair each non-null prediction with its original row position and its
# original (string) activity id.
df1 = pd.DataFrame(
    {
        'index': non_null_index,
        'activity_id': activity_id_non_null,
        'outcome': y_pred,
    }
)
df1

Unnamed: 0,index,activity_id,outcome
0,1,act2_230855,1
1,4,act2_1043301,1
2,5,act2_112890,1
3,6,act2_1169930,1
4,7,act2_1924448,1
...,...,...,...
458590,498682,act2_4367092,0
458591,498683,act2_4404220,0
458592,498684,act2_448830,0
458593,498685,act2_450133,0


## Predictions for rows with missing values (`activity_char` is null)

In [27]:
# Feature frame for the null model (no activity_char column). x_test is
# rebound here; it previously referred to the non-null frame.
x_test = df_test5_null
x_test

Unnamed: 0,people_id,activity_id,activity_category,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,days_difference
0,100004,249281,1,2,22593,3,40,25,9,4,16,2,2,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,76,0
1,10001,240724,1,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,0
2,10001,83552,1,2,25417,3,6,6,4,1,1,2,2,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,90,44
3,100010,218751,1,2,17304,2,8,7,8,1,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
4,100010,383524,1,2,17304,2,8,7,8,1,7,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40087,99881,250479,1,2,17304,2,26,6,8,3,9,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
40088,99899,412734,1,2,47052,3,15,6,6,5,15,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,72,0
40089,99934,85735,1,2,8831,3,40,25,9,1,1,4,4,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,53,86
40090,99956,352909,1,2,20306,3,4,8,5,5,15,2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,90,1


In [28]:
# Load the saved model that scores the rows without activity_char.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
model_null = joblib.load("/content/drive/MyDrive/B.Tech/kaggle/predicting-red-hat-business-value/model_null.joblib")

In [29]:
# Predict an outcome for every row of the null feature frame; y_pred is
# overwritten (the non-null predictions are already stored in df1).
y_pred = model_null.predict(x_test)
y_pred

array([0, 1, 1, ..., 1, 0, 1])

In [30]:
# Pair each null-frame prediction with its original row position and its
# original (string) activity id.
df2 = pd.DataFrame(
    {
        'index': null_index,
        'activity_id': activity_id_null,
        'outcome': y_pred,
    }
)
df2

Unnamed: 0,index,activity_id,outcome
0,0,act1_249281,0
1,2,act1_240724,1
2,3,act1_83552,1
3,27,act1_218751,0
4,28,act1_383524,0
...,...,...,...
40087,498620,act1_250479,0
40088,498626,act1_412734,1
40089,498635,act1_85735,1
40090,498648,act1_352909,0


In [31]:
# Stack both prediction sets, restore the original test-file row order using
# the saved positions, then drop that helper column.
output = (
    pd.concat([df1, df2])
    .sort_values(by='index')
    .drop('index', axis=1)
)
output

Unnamed: 0,activity_id,outcome
0,act1_249281,0
0,act2_230855,1
1,act1_240724,1
2,act1_83552,1
1,act2_1043301,1
...,...,...
458590,act2_4367092,0
458591,act2_4404220,0
458592,act2_448830,0
458593,act2_450133,0


In [32]:
# Write the submission (activity_id, outcome) to Drive; index=False keeps the
# DataFrame index out of the file.
output.to_csv('/content/drive/MyDrive/B.Tech/kaggle/predicting-red-hat-business-value/my_submission_1.csv', index=False)