# Load the data

In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning);
# consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')
# Project-local helpers: `in_opponent_base` is used for the base-position flags and
# `are_dead` for the dead-player features further down.
from transformations import in_opponent_base, are_dead


In [2]:
# Read the train and test feature tables, then stack them so that the row-wise
# transformations (the ones that need no standardization) run on all data at once.
train_df = pd.read_csv('train_features_plus.csv', index_col='match_id_hash')
test_df = pd.read_csv('test_features_plus.csv', index_col='match_id_hash')

# The first `idx_split` rows of the stacked frame are the training rows; the value
# is used at the end of the notebook to split the features back apart.
idx_split = len(train_df)
full_df = pd.concat([train_df, test_df], axis=0)

# Empty frame sharing the stacked index; engineered features are collected here.
new_features = pd.DataFrame(index=full_df.index)
full_df.head(2)


Unnamed: 0_level_0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed,d5_ability_level,d5_max_hero_hit,d5_purchase_count,d5_count_ability_use,d5_damage_dealt,d5_damage_received
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,0,...,0,0,0,0,0,164,6,4,2332,681
b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,7,...,0,0,0,0,4,164,11,7,2308,2154


# Building features on hero_ids

In [3]:
from collections import Counter

# Match outcomes (`radiant_win`) for the training matches.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative paths
# used for the feature files; consider keeping target_plus.csv next to them.
target = pd.read_csv(r'C:\Users\Tatiana\Documents\Programming\MLcourse competition\target_plus.csv', index_col = 'match_id_hash')['radiant_win']
train_full = train_df.merge(target, how='outer', left_index=True, right_index=True)

# hero id column names
ls_r_hero_id = ['r{}_hero_id'.format(i) for i in range(1,6)]
ls_d_hero_id = ['d{}_hero_id'.format(i) for i in range(1,6)]
ls_hero_id = ls_r_hero_id + ls_d_hero_id

# sub data frame of hero ids and target, split by match outcome
hero_ids = train_full[ls_hero_id + ['radiant_win']]
hero_ids_rad_win = hero_ids[hero_ids['radiant_win'] == True]    # radiant wins
hero_ids_rad_lose = hero_ids[hero_ids['radiant_win'] == False]  # radiant loses

# Team-neutral slot names ('1_id' ... '5_id') so both sides can be stacked together.
r_to_slot = {col: '{}_id'.format(i) for i, col in enumerate(ls_r_hero_id, start=1)}
d_to_slot = {col: '{}_id'.format(i) for i, col in enumerate(ls_d_hero_id, start=1)}

# rename() without inplace returns new frames instead of mutating slices of `hero_ids`.
winning_hero_ids1 = hero_ids_rad_win[ls_r_hero_id].rename(columns=r_to_slot)
winning_hero_ids2 = hero_ids_rad_lose[ls_d_hero_id].rename(columns=d_to_slot)
losing_hero_ids1 = hero_ids_rad_win[ls_d_hero_id].rename(columns=d_to_slot)
losing_hero_ids2 = hero_ids_rad_lose[ls_r_hero_id].rename(columns=r_to_slot)

# for all games, df of winner's hero ids only
winning_hero_ids = pd.concat([winning_hero_ids1, winning_hero_ids2], axis=0)
# for all games, df of loser's hero ids only
losing_hero_ids = pd.concat([losing_hero_ids1, losing_hero_ids2], axis=0)


def _games_per_hero(team_hero_ids):
    """Count how many times each hero id occurs anywhere in the five slot columns.

    Counting over all slots at once (instead of adding five per-slot counts)
    keeps heroes that never appear in one particular slot: adding misaligned
    `value_counts()` results would turn their total into NaN.
    """
    return pd.Series(team_hero_ids.values.ravel()).value_counts()


# by hero, in how many games did the hero win / lose (sorted by count)
winning_hero_counts = _games_per_hero(winning_hero_ids).sort_values()
losing_hero_counts = _games_per_hero(losing_hero_ids).sort_values()

# to dictionary, key is hero id, value is win / loss count
winning_hero_dict = winning_hero_counts.to_dict()
losing_hero_dict = losing_hero_counts.to_dict()

# wins - losses by hero (first feature); a hero seen only on one side counts as 0 on the other
hero_counts_win_minus_lose = winning_hero_counts.sub(losing_hero_counts, fill_value=0).sort_index()
diff_hero_dict = hero_counts_win_minus_lose.to_dict()

# normalize by the total number of games the hero played (second feature)
total_games_dict = Counter(winning_hero_dict) + Counter(losing_hero_dict)
hero_id_normalize_dict = {k: (diff_hero_dict[k] / total_games_dict[k]) for k in diff_hero_dict}

# add the two new features to the data frame; hero ids that never occur in the
# training matches are absent from the dictionaries and therefore map to NaN
for col in ls_hero_id:
    full_df[col+'success'] = full_df[col].map(diff_hero_dict)
    full_df[col+'norm'] = full_df[col].map(hero_id_normalize_dict)

# Transform health

In [4]:
# Per-team health features. `are_dead` is applied row-wise to the five health
# columns of a team (per the original note: the number of dead players in each team).
d_health = [f'd{slot}_health' for slot in range(1, 6)]
r_health = [f'r{slot}_health' for slot in range(1, 6)]
new_features['d_dead'] = full_df.loc[:, d_health].apply(are_dead, axis=1)
new_features['r_dead'] = full_df.loc[:, r_health].apply(are_dead, axis=1)

# Team average of health / max_health, accumulated one player at a time with weight 1/5.
for team, health_cols in (('d', d_health), ('r', r_health)):
    percentage_health = 0
    for health in health_cols:
        # 'd3_health' -> 'd3_max_health'
        max_health = health.split('_')[0] + '_max_health'
        ph = full_df[health] / full_df[max_health]
        percentage_health = percentage_health + 1/5*ph
    new_features[team + '_health_avg'] = percentage_health


# Transform coordinates

In [5]:
# Names of the x / y coordinate columns of the five players of each team,
# produced in the order radiant-x, radiant-y, dire-x, dire-y.
r_x, r_y, d_x, d_y = (
    [f'{team}{slot}_{axis}' for slot in range(1, 6)]
    for team in 'rd'
    for axis in 'xy'
)


In [6]:
# Indicator columns for each player: the result of `in_opponent_base` for the
# player's (x, y) position, evaluated once against 'Radiant' and once against 'Dire'.
#
# Each flag is computed a single time and stored in both frames: `new_features`
# keeps the per-player flags, and the copy in `full_df` lets the team-level
# aggregation further down pick the flags up like any other per-player column.
for j in range(5):
    slot = j + 1
    coords = {'d': (d_x[j], d_y[j]), 'r': (r_x[j], r_y[j])}

    # Same column order as before: d/r in the Radiant base, then d/r in the Dire base.
    for base_tag, opponent in (('r', 'Radiant'), ('d', 'Dire')):
        for team in ('d', 'r'):
            x_col, y_col = coords[team]
            in_base = full_df.loc[:, [x_col, y_col]].apply(
                lambda row: in_opponent_base(x=row[x_col], y=row[y_col], opponent=opponent),
                axis=1,
            )
            flag_name = '{}{}_in_{}_base'.format(team, slot, base_tag)
            new_features[flag_name] = in_base
            full_df[flag_name] = in_base
  

# Aggregate features within the team

### Aggregate by total value

In [7]:
# Create a feature that is the sum r1 + ... + r5 for every per-player r feature, and the
# same for every d feature. Features that do not make sense as a sum are removed.
#
# The team-level name is the player-1 column name with only the slot digit removed
# ('r1_kills' -> 'r_kills'). Dropping just that one character keeps any other '1'
# in the column name intact, so the per-player names rebuilt below always exist.
new_feats = [col[0] + col[2:] for col in full_df.columns.values if col.startswith(('r1', 'd1'))]

rem_feat = ['r_health', 'r_max_health','r_level', 'r_teamfight_participation', 'd_health','d_max_health', 'd_level','d_teamfight_participation']#, 'd_x', 'd_y', 'r_x', 'r_y']

# list.remove raises ValueError if an expected column is missing, which surfaces schema drift early
for feat in rem_feat:
    new_feats.remove(feat)

for feature in new_feats:
    # 'r_kills' -> ['r1_kills', ..., 'r5_kills']
    feat_names = ['{}{}_{}'.format(feature[0], i, feature[2:]) for i in range(1, 6)]
    new_features[feature] = full_df.loc[:, feat_names].sum(axis=1)


new_features = new_features.drop(columns = ['r_deaths', 'd_deaths'])


### Aggregate by std()

In [8]:
# Within-team spread: standard deviation over the five players of a team for each
# of the listed per-player statistics (radiant column first, then dire).
for stat in ['gold', 'xp','max_mana', 'level', 'ability_level', 'max_hero_hit', 'purchase_count',
             'count_ability_use', 'damage_dealt', 'damage_received']:
    for team in ('r', 'd'):
        player_cols = [f'{team}{i}_{stat}' for i in range(1, 6)]
        new_features[f'{team}_std_{stat}'] = full_df[player_cols].std(axis=1)

print(new_features.columns)

Index(['d_dead', 'r_dead', 'd_health_avg', 'r_health_avg', 'd1_in_r_base',
       'r1_in_r_base', 'd1_in_d_base', 'r1_in_d_base', 'd2_in_r_base',
       'r2_in_r_base',
       ...
       'r_std_max_hero_hit', 'd_std_max_hero_hit', 'r_std_purchase_count',
       'd_std_purchase_count', 'r_std_count_ability_use',
       'd_std_count_ability_use', 'r_std_damage_dealt', 'd_std_damage_dealt',
       'r_std_damage_received', 'd_std_damage_received'],
      dtype='object', length=102)


# Aggregate the level of players

In [9]:
# Mean, minimum and maximum player level within each team (dire first, then radiant).
d_levels = [f'd{slot}_level' for slot in range(1, 6)]
dire_levels = full_df[d_levels]
d_avg_level, d_min_level, d_max_level = (
    dire_levels.mean(axis=1),
    dire_levels.min(axis=1),
    dire_levels.max(axis=1),
)
new_features['d_avg_level'] = d_avg_level
new_features['d_min_level'] = d_min_level
new_features['d_max_level'] = d_max_level

r_levels = [f'r{slot}_level' for slot in range(1, 6)]
radiant_levels = full_df[r_levels]
r_avg_level, r_min_level, r_max_level = (
    radiant_levels.mean(axis=1),
    radiant_levels.min(axis=1),
    radiant_levels.max(axis=1),
)
new_features['r_avg_level'] = r_avg_level
new_features['r_min_level'] = r_min_level
new_features['r_max_level'] = r_max_level

new_features.head(2)

Unnamed: 0_level_0,d_dead,r_dead,d_health_avg,r_health_avg,d1_in_r_base,r1_in_r_base,d1_in_d_base,r1_in_d_base,d2_in_r_base,r2_in_r_base,...,r_std_damage_dealt,d_std_damage_dealt,r_std_damage_received,d_std_damage_received,d_avg_level,d_min_level,d_max_level,r_avg_level,r_min_level,r_max_level
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0.954967,0.7155,0,0,0,0,0,0,...,1940.546495,1141.711654,375.188353,244.929582,2.6,2,3,1.8,1,2
b9c57c450ce74a2af79c9ce96fac144d,0,0,0.764778,0.846735,0,1,0,0,0,0,...,7470.192601,4782.256455,2286.08635,2169.544929,6.0,4,7,7.4,6,9


# Adding non-team-specific data

In [10]:
# Build the smaller dataset: attach the match-level (non-team-specific) columns.
#
# `new_features` was created on `full_df.index`, so plain index-aligned column
# assignment is enough. An outer merge on the index is avoided on purpose: it is
# documented to sort the join keys and would duplicate rows for repeated keys,
# and either effect would break the positional train/test split done at the end.
match_level_cols = ['chat_len', 'game_time', 'game_mode', 'lobby_type']
new_features = new_features.assign(**{col: full_df[col] for col in match_level_cols})
new_features.head(3)

Unnamed: 0_level_0,d_dead,r_dead,d_health_avg,r_health_avg,d1_in_r_base,r1_in_r_base,d1_in_d_base,r1_in_d_base,d2_in_r_base,r2_in_r_base,...,d_avg_level,d_min_level,d_max_level,r_avg_level,r_min_level,r_max_level,chat_len,game_time,game_mode,lobby_type
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0.954967,0.7155,0,0,0,0,0,0,...,2.6,2,3,1.8,1,2,11,155,22,7
b9c57c450ce74a2af79c9ce96fac144d,0,0,0.764778,0.846735,0,1,0,0,0,0,...,6.0,4,7,7.4,6,9,10,658,4,0
6db558535151ea18ca70a6892197db41,0,0,0.989032,0.989333,0,0,1,0,0,0,...,1.0,1,1,1.0,1,1,0,21,23,0


# Add more features and take log(X) and X^2 of selected features

In [11]:
# Assist-based ratios; the +1 keeps each denominator non-zero when the count is 0.
assists_r = new_features['r_assists']
assists_d = new_features['d_assists']
new_features['ratio_assists'] = assists_r.div(assists_d.add(1))                   # radiant vs dire assists
new_features['r_ratio_assists'] = assists_r.div(new_features['r_denies'].add(1))  # radiant assists vs denies
new_features['d_ratio_assists'] = assists_d.div(new_features['d_denies'].add(1))  # dire assists vs denies

In [12]:
# Team-level feature suffixes that get a log transform; each one is expanded to
# its radiant ('r') and dire ('d') column, in that order.
log_tags = [
    '_kills', '_gold', '_lh', '_xp', '_max_mana', '_creeps_stacked',
    '_camps_stacked', '_rune_pickups', '_purchase_count', '_count_ability_use',
    '_damage_dealt', '_damage_received', '_max_hero_hit',
]
prefix = ['r', 'd']
col_for_log = [side + suffix for suffix in log_tags for side in prefix]
# match-level columns that are log-transformed as well
col_for_log.extend(['chat_len', 'game_time', 'ratio_assists'])

In [13]:
import math  # no longer needed by this cell; kept in case other cells rely on it

# Add log(x + 1) of every selected column as 'log_<column>'.
# np.log1p works on the whole column at once (no per-element Python call) and
# stays accurate for values close to zero.
for col in col_for_log:
    values = new_features[col]
    # log(x + 1) is undefined for x <= -1. Warnings are silenced globally in this
    # notebook, so fail explicitly instead of letting NaN / -inf slip through.
    if (values <= -1).any():
        raise ValueError('column {!r} has values <= -1; log(x + 1) is undefined'.format(col))
    new_features['log_{}'.format(col)] = np.log1p(values)

new_features.head()

Unnamed: 0_level_0,d_dead,r_dead,d_health_avg,r_health_avg,d1_in_r_base,r1_in_r_base,d1_in_d_base,r1_in_d_base,d2_in_r_base,r2_in_r_base,...,log_d_count_ability_use,log_r_damage_dealt,log_d_damage_dealt,log_r_damage_received,log_d_damage_received,log_r_max_hero_hit,log_d_max_hero_hit,log_chat_len,log_game_time,log_ratio_assists
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0.954967,0.7155,0,0,0,0,0,0,...,2.484907,9.075322,9.153347,8.353026,7.349874,5.209486,6.042633,2.484907,5.049856,0.0
b9c57c450ce74a2af79c9ce96fac144d,0,0,0.764778,0.846735,0,1,0,0,0,0,...,4.634729,11.152902,10.754044,9.961945,10.104794,6.877296,6.639876,2.397895,6.490724,1.504077
6db558535151ea18ca70a6892197db41,0,0,0.989032,0.989333,0,0,1,0,0,0,...,0.693147,4.26268,4.234107,4.234107,4.26268,4.26268,4.234107,0.0,3.091042,0.0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0.869557,0.955818,0,0,0,0,0,0,...,4.418841,10.827389,11.142325,9.632138,10.03491,6.651572,6.169611,1.609438,6.357842,1.609438
b1b35ff97723d9b7ade1c9c3cf48f770,0,0,0.808481,0.735127,0,0,0,0,0,0,...,4.26268,10.543076,10.900879,9.67206,9.148997,6.12905,6.499787,1.386294,6.118097,0.693147


In [14]:
# Add the square of each team's total gold and xp as 'sq_<column>'.
col_for_sq = ['r_gold', 'r_xp', 'd_gold', 'd_xp']

for base_col in col_for_sq:
    squared = new_features[base_col].apply(lambda value: value ** 2)
    new_features['sq_' + base_col] = squared

new_features.head()

Unnamed: 0_level_0,d_dead,r_dead,d_health_avg,r_health_avg,d1_in_r_base,r1_in_r_base,d1_in_d_base,r1_in_d_base,d2_in_r_base,r2_in_r_base,...,log_d_damage_received,log_r_max_hero_hit,log_d_max_hero_hit,log_chat_len,log_game_time,log_ratio_assists,sq_r_gold,sq_r_xp,sq_d_gold,sq_d_xp
match_id_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a400b8f29dece5f4d266f49f1ae2e98a,0,0,0.954967,0.7155,0,0,0,0,0,0,...,7.349874,5.209486,6.042633,2.484907,5.049856,0.0,4149369,3964081,15586704,9511056
b9c57c450ce74a2af79c9ce96fac144d,0,0,0.764778,0.846735,0,1,0,0,0,0,...,10.104794,6.877296,6.639876,2.397895,6.490724,1.504077,330221584,308564356,172764736,167055625
6db558535151ea18ca70a6892197db41,0,0,0.989032,0.989333,0,0,1,0,0,0,...,4.26268,4.26268,4.234107,0.0,3.091042,0.0,774400,0,230400,0
46a0ddce8f7ed2a8d9bd5edcbb925682,0,0,0.869557,0.955818,0,0,0,0,0,0,...,10.03491,6.651572,6.169611,1.609438,6.357842,1.609438,178169104,215003569,127712601,140256649
b1b35ff97723d9b7ade1c9c3cf48f770,0,0,0.808481,0.735127,0,0,0,0,0,0,...,9.148997,6.12905,6.499787,1.386294,6.118097,0.693147,65270241,95296644,84217329,116769636


# Recording the file

In [15]:
# Split the stacked features back into the train and test parts by position:
# the first `idx_split` rows are the training matches, the rest are the test matches.
train = new_features.iloc[:idx_split]
test = new_features.iloc[idx_split:]

# The positional split is only valid if no earlier step reordered, added or dropped
# rows, so check that both parts still line up with the frames they came from.
assert train.index.equals(train_df.index), 'train rows no longer match train_df'
assert test.index.equals(test_df.index), 'test rows no longer match test_df'

In [16]:
# Persist both feature tables next to the notebook (the index is written as the first column).
for frame, out_path in ((train, 'new_feat_train_small.csv'),
                        (test, 'new_feat_test_small.csv')):
    frame.to_csv(out_path)

In [17]:
# Sanity check: display the (rows, columns) shape of the test feature table.
test.shape

(10000, 148)