In [1]:
import pandas as pd
import numpy as np

In [53]:
# Question catalogue; the previews further down show one row per
# (question, answer option) pair.
df_1 = pd.read_csv('Data/Questions_Answers.csv')
# Per-user answer log. low_memory=False asks pandas to parse the file in a single
# pass rather than in chunks, so each column gets one consistently inferred dtype.
df_2 = pd.read_csv('Data/User_Question_Answer.csv', low_memory=False)

In [61]:
def display_all(df):
    """Display ``df`` with the pandas row and column limits raised to 100000.

    The limits are applied through ``pd.option_context``, so they only hold
    while ``df`` is being displayed and the previous settings are restored
    afterwards.
    """
    limit = 100000
    overrides = ('display.max_rows', limit, 'display.max_columns', limit)
    with pd.option_context(*overrides):
        display(df)

## Preprocessing

In [100]:
df_2.head()

Unnamed: 0,user_id,question_id,answer_id,datetimestamp
0,10109,21045,60570,2018-09-06 16:18:02
1,10135,21045,60571,2018-09-06 15:20:57
2,10237,21045,60571,2018-09-06 13:30:05
3,10261,21045,60570,2018-09-06 14:40:45
4,10268,21045,60571,2018-09-06 15:06:17


In [59]:
len(df_2), df_2.user_id.nunique()

(512410, 7243)

In [60]:
df_2['datetimestamp'].head()

0    2018-09-06 16:18:02
1    2018-09-06 15:20:57
2    2018-09-06 13:30:05
3    2018-09-06 14:40:45
4    2018-09-06 15:06:17
Name: datetimestamp, dtype: object

In [76]:
# Check if any row contains a null
df_2[df_2['datetimestamp'].isnull()]

Unnamed: 0,user_id,question_id,answer_id,datetimestamp
119871,82778,6,2018-09-12 19:09:06,


In [82]:
df_2[df_2['datetimestamp'] == '2018-09-17 14:56:5,61898,2018-09-15 21:21:00"']

Unnamed: 0,user_id,question_id,answer_id,datetimestamp
291489,92579,21545,61865,"2018-09-17 14:56:5,61898,2018-09-15 21:21:00"""


In [85]:
df_2[df_2['datetimestamp'] == '2018-09-18 13:0:\r\n74517,21562,61910,2018-09-16 07:01:52"']

Unnamed: 0,user_id,question_id,answer_id,datetimestamp
294320,75880,21559,61902,"2018-09-18 13:0:\r\n74517,21562,61910,2018-09-..."


In [77]:
# Drop the corrupt rows inspected above: their datetimestamp is either missing
# or not parseable as a date.
# Selecting them by parse failure instead of by hard-coded index labels keeps the
# cell idempotent (dropping a fixed label a second time raises KeyError) and
# independent of the row numbering of one particular CSV export.
_unparsed_ts = pd.to_datetime(df_2['datetimestamp'], errors='coerce').isna()
df_2.drop(index=df_2.index[_unparsed_ts], inplace=True)

In [87]:
# Convert datetimestamp from object type to datetime type
df_2['datetimestamp'] = pd.to_datetime(df_2['datetimestamp'])

In [88]:
df_2.dtypes

user_id                   int64
question_id               int64
answer_id                object
datetimestamp    datetime64[ns]
dtype: object

In [89]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 512407 entries, 0 to 512409
Data columns (total 4 columns):
user_id          512407 non-null int64
question_id      512407 non-null int64
answer_id        509801 non-null object
datetimestamp    512407 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 19.5+ MB


In [98]:
# Drop rows with a null answer_id for now, although in a real model a missing
# answer might itself be an important feature.
# subset= restricts the drop to the column we mean; a bare dropna() would also
# discard rows that have a null in any other column.
df_2.dropna(subset=['answer_id'], inplace=True)

In [99]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509801 entries, 0 to 512409
Data columns (total 4 columns):
user_id          509801 non-null int64
question_id      509801 non-null int64
answer_id        509801 non-null object
datetimestamp    509801 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 19.4+ MB


In [108]:
df_2.head()

Unnamed: 0,user_id,question_id,answer_id,datetimestamp
0,10109,21045,60570,2018-09-06 16:18:02
1,10135,21045,60571,2018-09-06 15:20:57
2,10237,21045,60571,2018-09-06 13:30:05
3,10261,21045,60570,2018-09-06 14:40:45
4,10268,21045,60571,2018-09-06 15:06:17


> df_2 is clean and ready to be processed in feature engineering

In [102]:
df_1.head()

Unnamed: 0,question_id,title,start_age,end_age,sex_id,category_id,nb_targets,nb_points,created_at,updated_at,topic_id,prefered_lang,rank,Total_Number_of_Answers,answer_id,description,Number_Of_Answers
0,21045,Are you trying to lose weight?,10,80,0.0,7,3000,1,2018-09-06 01:23:42,2018-09-06 15:12:03,0,0.0,0,3000,60570,Yes,1943
1,21045,Are you trying to lose weight?,10,80,0.0,7,3000,1,2018-09-06 01:23:42,2018-09-06 15:12:03,0,0.0,0,3000,60571,No,1057
2,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60724,Sohat,698
3,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60725,Rim,337
4,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60726,Sannine,139


In [13]:
df_1[df_1.question_id == 21045]

Unnamed: 0,question_id,title,start_age,end_age,sex_id,category_id,nb_targets,nb_points,created_at,updated_at,topic_id,prefered_lang,rank,Total_Number_of_Answers,answer_id,description,Number_Of_Answers
0,21045,Are you trying to lose weight?,10,80,0.0,7,3000,1,2018-09-06 01:23:42,2018-09-06 15:12:03,0,0.0,0,3000,60570,Yes,1943
1,21045,Are you trying to lose weight?,10,80,0.0,7,3000,1,2018-09-06 01:23:42,2018-09-06 15:12:03,0,0.0,0,3000,60571,No,1057


In [103]:
df_1[df_1.question_id == 21104]

Unnamed: 0,question_id,title,start_age,end_age,sex_id,category_id,nb_targets,nb_points,created_at,updated_at,topic_id,prefered_lang,rank,Total_Number_of_Answers,answer_id,description,Number_Of_Answers
2,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60724,Sohat,698
3,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60725,Rim,337
4,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60726,Sannine,139
5,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60727,Nestle Pure Water,417
6,21104,Which water brand do you prefer?,10,80,0.0,12,2000,1,2018-09-07 12:51:52,2018-09-07 12:55:01,0,,1,2000,60728,Tannourine,409


In [143]:
# For now we will only use these columns
df_q = df_1[['question_id', 'category_id', 'created_at', 'nb_points', 'sex_id']]

In [144]:
df_q.question_id.nunique()

235

In [145]:
df_q = df_q.drop_duplicates().copy()

In [146]:
df_q.reset_index(drop=True, inplace=True)

In [148]:
df_q.head()

Unnamed: 0,question_id,category_id,created_at,nb_points,sex_id
0,21045,7,2018-09-06 01:23:42,1,0.0
1,21104,12,2018-09-07 12:51:52,1,0.0
2,21106,10,2018-09-07 15:03:05,0,2.0
3,21107,10,2018-09-07 15:07:58,0,0.0
4,21108,10,2018-09-07 15:10:56,0,2.0


In [150]:
df_q.dtypes

question_id      int64
category_id      int64
created_at      object
nb_points        int64
sex_id         float64
dtype: object

In [151]:
# Convert created_at from object type to datetime type
df_q['created_at'] = pd.to_datetime(df_q['created_at'])

In [175]:
df_q.head()

Unnamed: 0,question_id,category_id,created_at,nb_points,sex_id
0,21045,7,2018-09-06 01:23:42,1,0.0
1,21104,12,2018-09-07 12:51:52,1,0.0
2,21106,10,2018-09-07 15:03:05,0,2.0
3,21107,10,2018-09-07 15:07:58,0,0.0
4,21108,10,2018-09-07 15:10:56,0,2.0


In [152]:
df_q.dtypes

question_id             int64
category_id             int64
created_at     datetime64[ns]
nb_points               int64
sex_id                float64
dtype: object

In [153]:
df_q.isnull().sum()

question_id    0
category_id    0
created_at     0
nb_points      0
sex_id         4
dtype: int64

In [154]:
df_q.sex_id.value_counts()

0.0    205
2.0     17
1.0      9
Name: sex_id, dtype: int64

In [156]:
# Fill missing sex_id with 0, the most frequent value in the counts above.
# The result is assigned back to the column: an in-place fillna on the
# `df_q.sex_id` accessor operates on an intermediate Series and is not
# guaranteed to write through to df_q.
df_q['sex_id'] = df_q['sex_id'].fillna(0)

> df_q is clean and ready for processing

## Merge the two dataframes into one dataframe for feature engineering

In [230]:
len(df_2), len(df_q)

(509801, 235)

In [231]:
df_2.columns

Index(['user_id', 'question_id', 'answer_id', 'datetimestamp'], dtype='object')

In [232]:
df_q.columns

Index(['question_id', 'category_id', 'created_at', 'nb_points', 'sex_id'], dtype='object')

In [233]:
# Attach the question attributes to every user answer (inner join on question_id).
# validate='many_to_one' makes pandas raise a MergeError if df_q ever contains
# more than one row per question_id, which would otherwise silently duplicate
# answer rows in the merged frame.
df = df_2.merge(df_q, on='question_id', how='inner', validate='many_to_one')

In [234]:
len(df)

509801

In [235]:
df.tail(15)

Unnamed: 0,user_id,question_id,answer_id,datetimestamp,category_id,created_at,nb_points,sex_id
509786,109229,21711,62311,2018-09-20 09:19:14,18,2018-09-19 22:45:10,1,0.0
509787,109231,21711,62311,2018-09-20 10:04:14,18,2018-09-19 22:45:10,1,0.0
509788,109240,21711,62311,2018-09-20 09:53:22,18,2018-09-19 22:45:10,1,0.0
509789,109251,21711,62311,2018-09-20 10:24:24,18,2018-09-19 22:45:10,1,0.0
509790,109264,21711,62311,2018-09-20 10:28:26,18,2018-09-19 22:45:10,1,0.0
509791,109265,21711,62311,2018-09-20 09:57:22,18,2018-09-19 22:45:10,1,0.0
509792,109268,21711,62311,2018-09-20 10:16:59,18,2018-09-19 22:45:10,1,0.0
509793,109269,21711,62311,2018-09-20 09:50:57,18,2018-09-19 22:45:10,1,0.0
509794,109273,21711,62310,2018-09-20 11:20:02,18,2018-09-19 22:45:10,1,0.0
509795,109278,21711,62310,2018-09-20 10:08:59,18,2018-09-19 22:45:10,1,0.0


In [236]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509801 entries, 0 to 509800
Data columns (total 8 columns):
user_id          509801 non-null int64
question_id      509801 non-null int64
answer_id        509801 non-null object
datetimestamp    509801 non-null datetime64[ns]
category_id      509801 non-null int64
created_at       509801 non-null datetime64[ns]
nb_points        509801 non-null int64
sex_id           509801 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(4), object(1)
memory usage: 35.0+ MB


In [237]:
# Re-type the id-like columns as pandas categoricals in a single cast.
df = df.astype({
    'category_id': 'category',
    'sex_id': 'category',
    'answer_id': 'category',
})

In [238]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509801 entries, 0 to 509800
Data columns (total 8 columns):
user_id          509801 non-null int64
question_id      509801 non-null int64
answer_id        509801 non-null category
datetimestamp    509801 non-null datetime64[ns]
category_id      509801 non-null category
created_at       509801 non-null datetime64[ns]
nb_points        509801 non-null int64
sex_id           509801 non-null category
dtypes: category(3), datetime64[ns](2), int64(3)
memory usage: 25.3 MB


## Feature Engineering

#### 1. Time difference between question and answer by user

In [239]:
# Whole hours between question creation and the user's answer, as a float column.
# created_at - datetimestamp is negative when the answer comes after creation;
# dividing by one hour and flooring gives the same whole-hour values that
# `.astype('timedelta64[h]')` produced on older pandas (e.g. -14h54m -> -15.0).
# That cast is rejected by pandas >= 2.0, which only supports s/ms/us/ns.
df['diff_time'] = np.floor(
    (df['created_at'] - df['datetimestamp']) / np.timedelta64(1, 'h')
)

In [240]:
df['diff_time'] = df['diff_time'].abs()

In [241]:
df.head()

Unnamed: 0,user_id,question_id,answer_id,datetimestamp,category_id,created_at,nb_points,sex_id,diff_time
0,10109,21045,60570,2018-09-06 16:18:02,7,2018-09-06 01:23:42,1,0.0,15.0
1,10135,21045,60571,2018-09-06 15:20:57,7,2018-09-06 01:23:42,1,0.0,14.0
2,10237,21045,60571,2018-09-06 13:30:05,7,2018-09-06 01:23:42,1,0.0,13.0
3,10261,21045,60570,2018-09-06 14:40:45,7,2018-09-06 01:23:42,1,0.0,14.0
4,10268,21045,60571,2018-09-06 15:06:17,7,2018-09-06 01:23:42,1,0.0,14.0


In [242]:
# Grouping by response time to questions on the app
# Worst Users
df.groupby('user_id').diff_time.mean().sort_values(ascending=False).head(20)

user_id
17067     301.571429
105961    290.500000
100494    290.000000
27546     270.000000
12468     235.333333
109201    234.200000
87931     232.428571
100593    199.000000
89703     198.166667
89745     195.916667
73854     195.666667
91049     194.083333
106638    193.916667
76005     193.500000
48799     192.153846
104309    183.692308
107158    183.312500
103347    183.250000
96001     182.611111
104876    182.375000
Name: diff_time, dtype: float64

In [243]:
# Best users
df.groupby('user_id').diff_time.mean().sort_values(ascending=False).tail(20)

user_id
105596    1.0
108150    1.0
107378    1.0
106908    1.0
105465    1.0
105841    1.0
83757     1.0
89062     1.0
104867    1.0
107376    1.0
108330    1.0
77509     1.0
108328    1.0
108327    1.0
108326    1.0
108277    1.0
94696     1.0
95398     1.0
28878     1.0
108270    1.0
Name: diff_time, dtype: float64

In [222]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 509801 entries, 0 to 509800
Data columns (total 9 columns):
user_id          509801 non-null int64
question_id      509801 non-null int64
answer_id        509801 non-null object
datetimestamp    509801 non-null datetime64[ns]
category_id      509801 non-null int64
created_at       509801 non-null datetime64[ns]
nb_points        509801 non-null int64
sex_id           509801 non-null float64
diff_time        509801 non-null float64
dtypes: datetime64[ns](2), float64(2), int64(4), object(1)
memory usage: 38.9+ MB


## Training Basic Model

# Treating every question as a feature