# 2019 Data Science Bowl - EDA

## Load Data & Libraries

In [10]:
import pandas as pd
import seaborn as sns

In [11]:
# Directory that holds the competition CSV files. Every read below is built
# from this one value, so pointing the notebook at another copy of the data
# only requires changing this line.
DATA_DIR = r'D:\data-science-bowl-2019'

# Load each competition file into its own DataFrame.
sample_submission = pd.read_csv(DATA_DIR + r'\sample_submission.csv')
specs = pd.read_csv(DATA_DIR + r'\specs.csv')
test = pd.read_csv(DATA_DIR + r'\test.csv')
train = pd.read_csv(DATA_DIR + r'\train.csv')
train_labels = pd.read_csv(DATA_DIR + r'\train_labels.csv')

## Finding relationships between `train` and `train_labels`

Columns shared by both DataFrames

In [12]:
# Column names present in both `train` and `train_labels`, as a plain list.
train_keys = (
    train.columns
    .intersection(train_labels.columns)
    .tolist()
)
train_keys

['game_session', 'installation_id', 'title']

Number of unique values in `train` vs. `train_labels`

In [18]:
# Distinct-value counts for each shared column, one column per source frame.
df = pd.concat(
    {
        'train': train[train_keys].nunique(),
        'train_labels': train_labels[train_keys].nunique(),
    },
    axis=1,
)
df

Unnamed: 0,train,train_labels
game_session,303319,17690
installation_id,17000,3614
title,44,5


Rows of `train` that are labelled, i.e. whose `game_session` appears in `train_labels`

In [27]:
# Keep only the events whose game session also appears in `train_labels`,
# then renumber the rows from zero. `reset_index` returns a new frame, so the
# result is independent of `train`.
train_labelled = (
    train
    .loc[train['game_session'].isin(train_labels['game_session'])]
    .reset_index(drop=True)
)
train_labelled

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,3bfd1a65,901acc108f55a5a1,2019-08-06T05:22:01.344Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
1,db02c830,901acc108f55a5a1,2019-08-06T05:22:01.400Z,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
2,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:01.403Z,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
3,a52b92d5,901acc108f55a5a1,2019-08-06T05:22:05.242Z,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
4,a1e4395d,901acc108f55a5a1,2019-08-06T05:22:05.244Z,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
...,...,...,...,...,...,...,...,...,...,...,...
865442,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
865443,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
865444,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
865445,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK


In [19]:
# Number of distinct (non-null) values in every column of the labelled events.
train_labelled.nunique(axis=0, dropna=True)

event_id               97
game_session        17690
timestamp          842415
event_data         805123
installation_id      3614
event_count          1274
event_code             22
game_time          135432
title                   5
type                    1
world                   3
dtype: int64

In [20]:
# Convert the ISO-8601 timestamp strings into timezone-aware datetimes.
# `utc=True` guarantees a single UTC datetime dtype for the whole column, even
# if some value were to carry no offset or a different one; without it such
# input could fall back to a mixed/object column.
train_labelled['timestamp'] = pd.to_datetime(train_labelled['timestamp'], utc=True)

In [29]:
# NOTE(review): this repeats the unique-value comparison already computed
# earlier in the notebook and rebinds `df` to the same result.
df = pd.concat(
    [train[train_keys].nunique(), train_labels[train_keys].nunique()],
    axis=1,
    keys=['train', 'train_labels'],
)
df

Unnamed: 0,train,train_labels
game_session,303319,17690
installation_id,17000,3614
title,44,5


In [None]:
def compute_accuracy_group(num_correct, num_incorrect):
    """Map an assessment's attempt counts to its accuracy group.

    The original cell used bare ``return`` statements at cell level, which is
    a SyntaxError; wrapping the logic in a function makes it runnable.

    Parameters
    ----------
    num_correct : int
        Number of correct attempts in the assessment session.
    num_incorrect : int
        Number of incorrect attempts in the assessment session.

    Returns
    -------
    int
        0 if there was no correct attempt; otherwise 3 for no incorrect
        attempts, 2 for exactly one, and 1 for two or more.
    """
    if num_correct == 0:
        return 0
    if num_incorrect == 0:
        return 3
    if num_incorrect == 1:
        return 2
    # Two or more incorrect attempts before the correct one all fall into
    # group 1; the original only handled exactly two and had no result for
    # larger counts.
    return 1

## Building `train2`
Merge of `train_labelled` & `train_labels`

In [73]:
# Attach the label columns to every labelled event.
# - `on=train_keys` spells out the join columns instead of relying on pandas
#   to infer them from whatever column names happen to overlap.
# - `validate='many_to_one'` makes the merge fail loudly if `train_labels`
#   ever contains more than one row per key, which would otherwise silently
#   duplicate event rows.
train2 = pd.merge(
    train_labelled,
    train_labels,
    on=train_keys,
    how='inner',
    validate='many_to_one',
)
train2.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct,num_incorrect,accuracy,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
1,db02c830,901acc108f55a5a1,2019-08-06 05:22:01.400000+00:00,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
2,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:01.403000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
3,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:05.242000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
4,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:05.244000+00:00,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3


In [86]:
# Summarise the merged frame: per-column dtype, non-null count and memory usage.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 865447 entries, 0 to 865446
Data columns (total 15 columns):
event_id           865447 non-null object
game_session       865447 non-null object
timestamp          865447 non-null datetime64[ns, UTC]
event_data         865447 non-null object
installation_id    865447 non-null object
event_count        865447 non-null int64
event_code         865447 non-null int64
game_time          865447 non-null int64
title              865447 non-null object
type               865447 non-null object
world              865447 non-null object
num_correct        865447 non-null int64
num_incorrect      865447 non-null int64
accuracy           865447 non-null float64
accuracy_group     865447 non-null int64
dtypes: datetime64[ns, UTC](1), float64(1), int64(6), object(7)
memory usage: 105.6+ MB


In [98]:
# Build the frame to export: `train2` without these three columns.
train2_csv = train2.drop(
    ['game_session', 'event_data', 'timestamp'],
    axis='columns',
)

In [99]:
# Write the reduced frame to disk.
# NOTE(review): the row index is written as an unnamed first column (pandas'
# default); confirm whether downstream readers expect it before dropping it.
train2_csv.to_csv(
    path_or_buf=r'D:\data-science-bowl-2019\train2.csv',
    index=True,
)

## References

- https://www.kaggle.com/robikscube/2019-data-science-bowl-an-introduction