# Parsing game logs

Tristan Miller, 4/28/2018

The goal here is to use game logs to determine which cards have the most impact.  Impact is measured by how much the presence of a card changes the players' gains.  Only gains of cards that are in the supply are counted (so prizes and spoils, for example, are excluded).

The data set is 140,000 game logs in which at least one of the players was in the top 100.  The game logs date back to Guilds, which makes some things easier.

In [1]:
#packages
import os
import re
import pdb
import numpy as np
import pickle
import time
import pandas as pd

In [2]:
# os.listdir returns entries in arbitrary order; sort them so that index-based
# slices of filelist select the same log files on every run.
filelist = sorted(os.listdir('logs'))

In [3]:
# Ruins and Knights are each a single supply pile made up of differently
# named cards; special_cases maps every individual card name to its pile name.
ruins = ['Ruined Market','Ruined Village','Abandoned Mine','Survivors','Ruined Library']
knights = ['Dame Josephine','Dame Sylvia','Dame Natalie','Dame Anna','Dame Molly','Sir Martin','Sir Destry','Sir Bailey','Sir Michael','Sir Vander']
special_cases = {knight_name: 'Knights' for knight_name in knights}
special_cases.update({ruin_name: 'Ruins' for ruin_name in ruins})

In [4]:
#Parse a log, extracting the cards in the supply, and the number of cards gained by each player
def parse_log(filepath):
    """Parse one two-player game log.

    The header consists of a 'Supply cards: ...' line followed by two
    '<player> - starting cards:' lines, and must be complete within the first
    20 lines of the file.  Every later '<player> - gains <card>' line is
    tallied for that player when <card> is a supply card, or when
    special_cases maps <card> to a name that is in the supply.  Gains of
    anything else are ignored.

    Args:
        filepath: path of the log file (read as UTF-8).

    Returns:
        A tuple (supply, p1gains, p2gains): the list of supply card names,
        and one dict per player mapping each supply card to the number of
        times that player gained it.

    Raises:
        RuntimeError: if the header is not found within the first 20 lines.
    """
    header_limit = 20
    supply = None
    players = []
    with open(filepath,'r',encoding='utf-8') as f:
        #first the supply line, then the two player lines, in that order
        for _ in range(header_limit):
            line = f.readline()
            if not line:
                #end of file reached before the header was complete
                break
            if supply is None:
                m = re.match('Supply cards: (.+)$',line)
                if m:
                    supply = re.split(', ',m.group(1))
            else:
                m = re.match('(.+) - starting cards:',line)
                if m:
                    players.append(m.group(1))
                    if len(players) == 2:
                        break

        if supply is None or len(players) < 2:
            #If the header hasn't been found yet, something went wrong
            raise RuntimeError('Error: could not parse header in {}'.format(filepath))

        p1name, p2name = players
        #Names are escaped only to build the pattern; the raw names are kept
        #so that they can be compared with the text the pattern captures.
        gain_pattern = re.compile(
            '(' + re.escape(p1name) + '|' + re.escape(p2name) + ') - gains (.+)$')

        #Next, calculate the number of times each card was gained
        p1gains = dict.fromkeys(supply,0)
        p2gains = dict.fromkeys(supply,0)
        for line in f:
            m = gain_pattern.match(line)
            if not m:
                continue
            gains = p1gains if m.group(1) == p1name else p2gains
            card = m.group(2)
            if card in gains:
                gains[card] += 1
            elif card in special_cases and special_cases[card] in gains:
                #individual card that belongs to a shared supply pile
                gains[special_cases[card]] += 1
    return supply, p1gains, p2gains


In [5]:
#reads supply only
def parse_supply(filepath):
    """Return the list of supply card names recorded in a game log.

    Scans the file (read as UTF-8) for the first line matching
    'Supply cards: <card>, <card>, ...' and splits its card list on ', '.

    Args:
        filepath: path of the log file.

    Returns:
        The supply card names, in the order they appear on the line.

    Raises:
        ValueError: if no line of the file matches 'Supply cards: ...'.
    """
    with open(filepath,'r',encoding='utf-8') as f:
        #iterating the file stops at end-of-file, so a log without a supply
        #line ends the search instead of looping forever
        for line in f:
            m = re.match('Supply cards: (.+)$',line)
            if m:
                return re.split(', ',m.group(1))
    raise ValueError('no "Supply cards:" line found in {}'.format(filepath))

In [6]:
#print the log (for testing)
def print_log(filenum):
    """Print the full text of the log at position filenum in filelist."""
    log_path = 'logs/' + filelist[filenum]
    with open(log_path,'r',encoding='utf-8') as log_file:
        contents = log_file.read()
    print(contents)

In [7]:
#get a list of all supply cards
#card_list is a list of all unique cards
#card_dict is a dict of all unique cards, with the value being the position in card_list

#Collect every card name that appears in the supply of the last 2000 entries
#of filelist (or of all entries, if there are fewer than 2000).
card_dict = {}
for i,filename in enumerate(filelist[-2000:]):
    try:
        supply = parse_supply('logs/'+filename)
    except Exception:
        #report the position of the unreadable log and move on; catching
        #Exception (not everything) lets Ctrl-C still interrupt the scan
        print(i)
    else:
        for card in supply:
            card_dict[card] = 0

#Number the cards in the order they were first seen.
card_list = list(card_dict)
for i,card in enumerate(card_list):
    card_dict[card] = i

In [8]:
#number of unique supply cards collected in card_list
print(len(card_list))

217


In [18]:
#Parse the logs, and combine data
#num_files is the number of files to read (by default all of them)
#num_games is an ndarray of the number of games with each pair of cards
#game_gains is an ndarray of the number of games where [col] was gained, and had [row] in the kingdom
#total_gains is an ndarray of the number of gains of [col] in kingdoms that have [row]
#note that games are double counted, since we look at it from the point of view of each player
def combine_logs(first_file = 0, last_file = -1,verbose=False):
    """Accumulate pairwise card statistics over a range of log files.

    Args:
        first_file: index into filelist of the first log to read.
        last_file: index one past the last log to read; a negative value
            means "through the end of filelist".
        verbose: if True, the message printed for a log that could not be
            processed also includes the exception that caused it.

    Returns:
        A tuple (num_games, game_gains, total_gains) of integer arrays of
        shape (len(card_list), len(card_list)), indexed by the positions
        stored in card_dict, as described in the comments above.

    A log that cannot be parsed, or whose supply contains a card missing from
    card_dict, is reported and skipped without contributing to any matrix.
    """
    start_time = time.time()
    n_cards = len(card_list)
    num_games = np.zeros((n_cards,n_cards),dtype='int')
    game_gains = np.zeros((n_cards,n_cards),dtype='int')
    total_gains = np.zeros((n_cards,n_cards),dtype='int')

    if last_file < 0:
        last_file = len(filelist)

    for i,filename in zip(range(first_file,last_file),filelist[first_file:last_file]):
        try:
            supply, p1gains, p2gains = parse_log('logs/'+filename)
            #Resolve everything that can fail before touching the matrices,
            #so a bad log never leaves a partially counted game behind.
            indices = [card_dict[card] for card in supply]
            gain_totals = [p1gains[card] + p2gains[card] for card in supply]
            gain_players = [int(p1gains[card] > 0) + int(p2gains[card] > 0) for card in supply]
        except Exception as err:
            #Exception rather than a bare except, so Ctrl-C still stops the run
            if verbose:
                print('error in file',filename,'-',repr(err))
            else:
                print('error in file',filename)
        else:
            for p in indices:
                for q,total,players in zip(indices,gain_totals,gain_players):
                    num_games[p][q] += 2
                    total_gains[p][q] += total
                    game_gains[p][q] += players
        if i % 1000 == 0:
            print(i,'files processed in',(time.time() - start_time)/60,'minutes')

    return num_games, game_gains, total_gains
    

In [19]:
num_games, game_gains, total_gains = combine_logs()

#Cache the results on disk so the logs don't have to be parsed again.
with open('gain_matrices.pkl', 'wb') as matrices_file:
    pickle.dump(
        (num_games, game_gains, total_gains),
        matrices_file,
    )
with open('card_list.pkl', 'wb') as cards_file:
    pickle.dump(
        (card_list, card_dict),
        cards_file,
    )

0 files processed in 0.0011833389600118002 minutes
1000 files processed in 0.35548386971155804 minutes
2000 files processed in 0.7260510842005412 minutes
3000 files processed in 1.2062184890111287 minutes
4000 files processed in 1.5665857076644898 minutes
5000 files processed in 1.930069589614868 minutes
6000 files processed in 2.3793583552042645 minutes
7000 files processed in 2.8077518343925476 minutes
8000 files processed in 3.1695357163747153 minutes
9000 files processed in 3.518402910232544 minutes
10000 files processed in 3.8773034532864887 minutes
11000 files processed in 4.257620708147685 minutes
12000 files processed in 4.613687924544016 minutes
13000 files processed in 4.971588456630707 minutes
14000 files processed in 5.341205664475759 minutes
15000 files processed in 5.687772858142853 minutes
16000 files processed in 6.035590056578318 minutes
17000 files processed in 6.38902390797933 minutes
18000 files processed in 6.748841126759847 minutes
19000 files processed in 7.11997