# 2019 Data Science Bowl - EDA

## Load Data & Libraries

In [1]:
from pathlib import Path

import pandas as pd
import seaborn as sns

In [2]:
# Folder holding the competition CSVs. Every read below goes through this one
# constant, so pointing the notebook at another copy of the data is a single edit.
DATA_DIR = Path(r'D:\data-science-bowl-2019')

# Load each competition file into its own DataFrame.
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')
specs = pd.read_csv(DATA_DIR / 'specs.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')
train = pd.read_csv(DATA_DIR / 'train.csv')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv')

## Finding relationships between `train` and `train_labels`

Columns shared by both DataFrames

In [12]:
# Columns that appear in both `train` and `train_labels`.
shared_columns = train.columns.intersection(train_labels.columns)
train_keys = shared_columns.tolist()
train_keys

['game_session', 'installation_id', 'title']

Number of unique values in `train` vs. `train_labels`

In [38]:
# Side-by-side count of distinct values per shared column in each frame.
# `keys` names the two resulting columns directly.
df = pd.concat(
    [train[train_keys].nunique(), train_labels[train_keys].nunique()],
    axis=1,
    keys=['train', 'train_labels'],
)
df

Unnamed: 0,train,train_labels
game_session,303319,17690
installation_id,17000,3614
title,44,5


Rows of `train` whose `installation_id` also appears in `train_labels`

In [55]:
# Keep only the events of installations that have at least one row in train_labels,
# then renumber the rows from 0 so the index no longer has gaps.
labelled_mask = train['installation_id'].isin(train_labels['installation_id'])
train_labelled = train.loc[labelled_mask].reset_index(drop=True)
train_labelled

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,34ba1a28d02ba8ba,2019-08-06T04:57:18.904Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,4b57c9a59474a1b9,2019-08-06T04:57:45.301Z,"{""event_code"": 2000, ""event_count"": 1}",0006a69f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,2b9d5af79bcdb79f,2019-08-06T04:58:14.538Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,2b9d5af79bcdb79f,2019-08-06T04:58:14.615Z,"{""description"":""Let's build a sandcastle! Firs...",0006a69f,2,3010,29,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1325467d,2b9d5af79bcdb79f,2019-08-06T04:58:16.680Z,"{""coordinates"":{""x"":273,""y"":650,""stage_width"":...",0006a69f,3,4070,2137,Sandcastle Builder (Activity),Activity,MAGMAPEAK
...,...,...,...,...,...,...,...,...,...,...,...
7734553,28520915,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""misses"":1,""prompt"":""holds least"",""mode"":""sel...",ffeb0b1b,58,2030,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7734554,91561152,5448d652309a6324,2019-09-22T02:07:27.562Z,"{""bucket"":1,""buckets_placed"":[3,1,2],""target_b...",ffeb0b1b,57,4025,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7734555,d3268efa,5448d652309a6324,2019-09-22T02:07:27.566Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,59,3021,67094,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
7734556,b5053438,5448d652309a6324,2019-09-22T02:07:28.311Z,"{""description"":""Awesome."",""identifier"":""Dot_Aw...",ffeb0b1b,60,3121,67847,Cauldron Filler (Assessment),Assessment,MAGMAPEAK


In [56]:
# Parse the timestamp strings into datetime values and store them back in the column.
parsed_timestamps = pd.to_datetime(train_labelled['timestamp'])
train_labelled['timestamp'] = parsed_timestamps

In [79]:
# Distinct values per column, followed by the (rows, columns) shape.
for summary in (train_labelled.nunique(), train_labelled.shape):
    print(summary)

event_id               379
game_session        175467
timestamp          7347901
event_data         7457863
installation_id       3614
event_count           3182
event_code              42
game_time           673592
title                   44
type                     4
world                    4
dtype: int64
(7734558, 11)


In [80]:
# Distinct values per column and the frame's shape, one per line.
print(train_labels.nunique(), train_labels.shape, sep='\n')

game_session       17690
installation_id     3614
title                  5
num_correct            2
num_incorrect         46
accuracy              30
accuracy_group         4
dtype: int64
(17690, 7)


In [None]:
# Recomputes the distinct-value comparison of `train` vs. `train_labels` over the
# shared columns (the same table as the earlier "Number of unique values" cell).
unique_counts = {
    'train': train[train_keys].nunique(),
    'train_labels': train_labels[train_keys].nunique(),
}
df = pd.DataFrame(unique_counts)
df

## Building `train2`
Inner merge of `train_labelled` and `train_labels` on their shared columns (`game_session`, `installation_id`, `title`).

In [73]:
# Attach the label columns of each assessment session to that session's events.
# The join type and keys are spelled out instead of relying on pandas picking every
# same-named column, so a column added to either frame later cannot silently change
# the join. `train_keys` holds the columns shared by `train` and `train_labels`.
# validate='many_to_one' makes pandas raise a MergeError if a key combination occurs
# more than once in train_labels, which would otherwise duplicate event rows.
train2 = pd.merge(
    train_labelled,
    train_labels,
    how='inner',
    on=train_keys,
    validate='many_to_one',
)
train2.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,num_correct,num_incorrect,accuracy,accuracy_group
0,3bfd1a65,901acc108f55a5a1,2019-08-06 05:22:01.344000+00:00,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0006a69f,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
1,db02c830,901acc108f55a5a1,2019-08-06 05:22:01.400000+00:00,"{""event_count"":2,""game_time"":37,""event_code"":2...",0006a69f,2,2025,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
2,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:01.403000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,3,3010,37,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
3,a52b92d5,901acc108f55a5a1,2019-08-06 05:22:05.242000+00:00,"{""description"":""Pull three mushrooms out of th...",0006a69f,4,3110,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3
4,a1e4395d,901acc108f55a5a1,2019-08-06 05:22:05.244000+00:00,"{""description"":""To pick a mushroom, pull it ou...",0006a69f,5,3010,3901,Mushroom Sorter (Assessment),Assessment,TREETOPCITY,1,0,1.0,3


In [86]:
# Show column dtypes, non-null counts and memory usage of the merged frame.
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 865447 entries, 0 to 865446
Data columns (total 15 columns):
event_id           865447 non-null object
game_session       865447 non-null object
timestamp          865447 non-null datetime64[ns, UTC]
event_data         865447 non-null object
installation_id    865447 non-null object
event_count        865447 non-null int64
event_code         865447 non-null int64
game_time          865447 non-null int64
title              865447 non-null object
type               865447 non-null object
world              865447 non-null object
num_correct        865447 non-null int64
num_incorrect      865447 non-null int64
accuracy           865447 non-null float64
accuracy_group     865447 non-null int64
dtypes: datetime64[ns, UTC](1), float64(1), int64(6), object(7)
memory usage: 105.6+ MB


In [92]:
# Slimmer frame without the session id, the raw event payload and the timestamp.
# NOTE(review): the MemoryError recorded under this cell reports an array of shape
# (3614, 865447); a column drop should not allocate that, so the saved output looks
# stale — confirm by re-running this cell on a fresh kernel.
temp = train2.drop(['game_session', 'event_data', 'timestamp'], axis='columns')

MemoryError: Unable to allocate array with shape (3614, 865447) and data type uint8

In [None]:
# Persist the merged frame. index=False leaves out the row index, which here is only
# the default 0..n-1 counter and would otherwise come back as an extra
# "Unnamed: 0" column when the file is read again.
train2.to_csv(r'D:\data-science-bowl-2019\train2.csv', index=False)

## References

- https://www.kaggle.com/robikscube/2019-data-science-bowl-an-introduction