# Предобработка данных

1. Удаление записей с датой 1970-01-01
2. Заполнение пропусков значением "missing"
3. Расчет местного времени
4. Генерация признаков месяц, дата, день недели, час, минута (по местному времени)
5. Все, что можно, переведено в int8
6. Разбиение выборки на трейн(первые 2 недели) / тест(последняя неделя) 
7. Сохранение в пиклы

In [1]:
import pickle
import numpy as np
import pandas as pd
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

# Загрузка данных

In [2]:
%%time
# Load the raw training log; `created` is parsed into datetime64 at read time.
data = pd.read_csv('../data/raw/train.csv', parse_dates=['created'])
display(data.head())
print(data.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
data.info()

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
1,4,,,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0
2,5,Games,Arcade,com.orbitalknight.ridiculousfreekick,2021-08-04 13:34:29,MSK,Санкт-Петербург,Санкт-Петербург,android,9.0
3,5,,,tcouchgind.scooterextreme.scooter,2021-08-06 07:35:27,MSK+2,Свердловская область,Екатеринбург,android,9
4,4,,,com.FidgetTrading3D.game,2021-08-02 20:43:59,MSK,Московская область,Звенигород,android,6.0.1


(44854516, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44854516 entries, 0 to 44854515
Data columns (total 10 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Segment          int64         
 1   gamecategory     object        
 2   subgamecategory  object        
 3   bundle           object        
 4   created          datetime64[ns]
 5   shift            object        
 6   oblast           object        
 7   city             object        
 8   os               object        
 9   osv              object        
dtypes: datetime64[ns](1), int64(1), object(8)
memory usage: 3.3+ GB


None

Wall time: 1min 43s


# Анализ

In [3]:
display(data.head(2))

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
1,4,,,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0


## Проверка пропусков в данных

In [4]:
# Number of missing (NaN) values in each column.
data.isna().sum()

Segment                   0
gamecategory       16961331
subgamecategory    16968425
bundle                17284
created                   0
shift               3591150
oblast              3455278
city                4799992
os                      233
osv                     283
dtype: int64

## Количество значений в признаках

In [5]:
data['Segment'].value_counts()

5    17187506
3    14187054
4    11142080
2     1416245
1      921631
Name: Segment, dtype: int64

In [6]:
data['gamecategory'].value_counts()

Games                 24555534
Applications           2742231
Lifestyle               267292
Shopping                184875
Entertainment            31465
Social Networking        28931
Education                27008
Book                     23483
Photo & Video            10235
Utilities                 7560
Music                     4571
IAB9-30                   3474
Health & Fitness          1423
Finance                   1322
Productivity              1314
Travel                     778
Sports                     614
Navigation                 332
Reference                  235
IAB9,IAB9-30,games         226
Graphics & Design           73
Business                    67
Medical                     66
Food & Drink                44
News                        23
Weather                      5
Stickers                     4
Name: gamecategory, dtype: int64

In [7]:
data['subgamecategory'].value_counts()

Puzzle                     4691498
Action                     3412940
Casual                     3242897
Simulation                 2757136
Word                       1827741
Arcade                     1372731
Strategy                   1276408
Role Playing               1202308
Card                       1041830
Board                       883606
Racing                      812515
Adventure                   736299
Tools                       666860
None                        647453
Sports                      576169
Dating                      296753
Shopping                    287733
Music                       286998
Productivity                222971
Entertainment               218854
Books & Reference           211505
Health & Fitness            176483
Video Players & Editors     151221
Casino                      149003
Trivia                      134411
Music & Audio               115483
Education                    86473
Family                       77440
Social              

In [8]:
data['bundle'].value_counts()

com.fugo.wow                                       1678124
net.wargaming.wot.blitz                            1101717
com.openmygame.games.android.wordpizza              902581
com.axlebolt.standoff2                              632030
com.yourstoryinteractive.sails.pirate.adventure     491705
                                                    ...   
videoeditor.mixer.free                                   1
com.spider.granny.horror.scarry.gamev1                   1
com.boombitgames.CellConnect                             1
com.Para.Dollidol.Dressup                                1
com.toca.master                                          1
Name: bundle, Length: 85797, dtype: int64

In [9]:
data['shift'].value_counts()

MSK      26308751
MSK+2     5987717
MSK+3     2875851
MSK+1     1895297
MSK+7     1445292
MSK+4     1098024
MSK+5      821276
MSK+6      535227
MSK-1      198047
MSK+9       76542
MSK+8       21342
Name: shift, dtype: int64

In [10]:
data['oblast'].value_counts()

Москва                  7680369
Свердловская область    3079042
Санкт-Петербург         3070245
Краснодарский край      2998397
Татарстан               2243451
                         ...   
Витебская Область            36
Gomelskaya Oblast            29
Могилевская область          28
Гродненская Область          24
Минская Область              10
Name: oblast, Length: 90, dtype: int64

In [11]:
data['city'].value_counts()

Москва             7644426
Санкт-Петербург    3041452
Екатеринбург       2692591
Краснодар          2294394
Казань             1891205
                    ...   
Осовцы                   1
Дубовики                 1
Логойск                  1
Новогрудок               1
Урдома                   1
Name: city, Length: 2498, dtype: int64

In [12]:
# Frequency of each OS label. The captured output lists 'android'/'Android'
# and 'ios'/'iOS' as separate values, i.e. the column is not case-normalized.
data['os'].value_counts()

android    35245849
ios         5248627
Android     4253330
iOS          106477
Name: os, dtype: int64

In [13]:
# Frequency of each OS-version string. The captured output shows free-form
# values (e.g. '10.0' and '10.0.0' counted separately, 'iOS', '22(10.0)').
data['osv'].value_counts()

10.0                  9447724
11.0                  4693227
9.0                   4542376
10.0.0                4274736
8.1.0                 2603022
                       ...   
iOS                         1
22(10.0)                    1
10.0 / API-29               1
666.0.0                     1
7.0_Lomaster_ROM.0          1
Name: osv, Length: 254, dtype: int64

## Даты

In [14]:
data['created'].min()

Timestamp('1970-01-01 03:00:00')

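<div class='alert alert-info'> Особо одаренные пользовались смартфонами уже в 1970 году (по всей видимости, это нулевой Unix-timestamp, записанный в московском времени, UTC+3, то есть дата события фактически неизвестна) </div>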

In [15]:
# Flag the rows stamped with the earliest timestamp, then report the date
# range of the remaining rows and how many rows were flagged.
earliest = data['created'].min()
mask = data['created'] == earliest
period_start = data['created'][~mask].min()
period_end = data['created'].max()
print(f"Период данных: {period_start} - {period_end}")
print(f"Количество дат 1970-01-01: {mask.sum()}")

Период данных: 2021-07-05 00:00:00 - 2021-09-20 23:59:40
Количество дат 1970-01-01: 233


<div class='alert alert-info'> 233 из почти 45 миллионов записей - можно удалить </div>

# Подготовка данных

In [16]:
# Filtering
def filter_data(data):
    """Return the rows of ``data`` whose ``created`` is not 1970-01-01 03:00:00.

    Rows are selected with a boolean mask; the input frame is not modified
    and the surviving rows keep their original index labels.
    """
    epoch_marker = pd.Timestamp('1970-01-01 03:00:00')
    keep_rows = data['created'] != epoch_marker
    return data[keep_rows]

data = filter_data(data)

In [17]:
data['Segment'].value_counts()

5    17187468
3    14186984
4    11141964
2     1416243
1      921624
Name: Segment, dtype: int64

In [18]:
%%time

# Type cast: store the target label as a compact 8-bit integer
data['Segment'] = data['Segment'].astype(np.int8)

# Gap filling: every missing value becomes the 'missing' placeholder
data = data.fillna(value='missing')

Wall time: 2min 21s


# Генерация признаков

In [19]:
def get_time_features(data):
    """Add local-time calendar features derived from ``created`` and ``shift``.

    ``shift`` holds the offset from Moscow time as text ('MSK', 'MSK+2',
    'MSK-1', ...). That offset, in hours, is added to ``created`` to obtain
    the local time, from which the calendar features are extracted.

    The frame is modified in place and is also returned. Columns added:
    ``hour_shift`` (int8), ``local_time`` (datetime), ``month``, ``day``,
    ``wd`` (weekday, Monday = 0), ``hour``, ``minutes`` (all int8) and
    ``date`` (``datetime.date`` objects).

    Rows whose shift is absent (NaN or the 'missing' placeholder) get an
    offset of 0. Any other label whose text after removing 'MSK' is not an
    integer raises ``ValueError``.
    """
    # Only a handful of distinct labels exist, so parse each one once and map
    # the result back, instead of running a Python lambda on every row.
    shift_to_hours = {}
    for label in data['shift'].dropna().unique():
        suffix = label.replace('MSK', '')
        shift_to_hours[label] = 0 if suffix in ('', 'missing') else int(suffix)

    # NaN labels are not in the mapping and come back as NaN -> offset 0,
    # the same value the 'missing' placeholder receives.
    data['hour_shift'] = (
        data['shift'].map(shift_to_hours).fillna(0).astype('int8')
    )

    # unit='h' replaces the deprecated '1H' alias.
    data['local_time'] = data['created'] + pd.to_timedelta(
        data['hour_shift'], unit='h'
    )

    local = data['local_time'].dt
    data['month'] = local.month.astype('int8')
    # Kept as datetime.date objects (object dtype) so the column type seen by
    # downstream consumers of the pickles does not change.
    data['date'] = local.date
    data['day'] = local.day.astype('int8')
    data['wd'] = local.weekday.astype('int8')
    data['hour'] = local.hour.astype('int8')
    data['minutes'] = local.minute.astype('int8')
    return data

In [20]:
data.head(2)

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0
1,4,missing,missing,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0


In [21]:
data = get_time_features(data)

In [22]:
data.head(2)

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv,hour_shift,local_time,month,date,day,wd,hour,minutes
0,4,Games,Racing,com.MadOut.BIG,2021-07-05 18:07:40,MSK+6,Забайкальский Край,Чита,android,10.0,6,2021-07-06 00:07:40,7,2021-07-06,6,1,0,7
1,4,missing,missing,com.easybrain.solitaire.klondike.free,2021-07-10 10:38:42,MSK+2,Оренбургская область,Оренбург,Android,10.0.0,2,2021-07-10 12:38:42,7,2021-07-10,10,5,12,38


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44854283 entries, 0 to 44854515
Data columns (total 18 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Segment          int8          
 1   gamecategory     object        
 2   subgamecategory  object        
 3   bundle           object        
 4   created          datetime64[ns]
 5   shift            object        
 6   oblast           object        
 7   city             object        
 8   os               object        
 9   osv              object        
 10  hour_shift       int8          
 11  local_time       datetime64[ns]
 12  month            int8          
 13  date             object        
 14  day              int8          
 15  wd               int8          
 16  hour             int8          
 17  minutes          int8          
dtypes: datetime64[ns](2), int8(7), object(9)
memory usage: 4.3+ GB


In [26]:
# Persist the full preprocessed frame as a binary pickle.
with open('../data/processed/data.pcl', 'wb') as out_file:
    pickle.Pickler(out_file).dump(data)

# Train test split

In [None]:
# Row count per local calendar date, ordered by date.
# NOTE(review): presumably inspected to pick the train/test cut-off — confirm.
data['date'].value_counts().sort_index()

In [None]:
%%time
# `date` holds datetime.date objects (it is built with .dt.date). Ordering a
# date against a pandas Timestamp is deprecated in pandas 1.x and raises
# TypeError in pandas 2.x, so the cut-off is expressed as a date as well.
split_date = pd.Timestamp('2021-08-09').date()

# Rows dated on or before the cut-off form the train set; all others the test
# set. One mask and its negation guarantee each row lands in exactly one part.
is_train = data['date'] <= split_date
train = data[is_train]
test = data[~is_train]

In [None]:
# Persist the train part as a binary pickle.
with open('../data/processed/train.pcl', 'wb') as train_file:
    pickle.Pickler(train_file).dump(train)

In [None]:
# Persist the test part as a binary pickle.
with open('../data/processed/test.pcl', 'wb') as test_file:
    pickle.Pickler(test_file).dump(test)

In [None]:
train.head(2)