In [65]:
import re, itertools as itr, collections as col, pandas as pd, langdetect, io

In [66]:
# Read the entire log into memory once, decoded as UTF-8 text; later cells
# scan `contents` with a regex.
with io.open('bounce.log', encoding='utf8') as f:
    contents = f.read()

In [27]:
# One "player joined" log record. Capture groups, in order: player name,
# style, client id (the parenthesised number right after the name is matched
# but not captured).
pat = re.compile(r"""'player' '(.+?) \(\d+?\)' 'with style' '(.+?)' 'joined \(client (\d+?)\)'""")
def joins(text=None):
    """Yield a ``(cli, name, sty)`` tuple for every join record found in `text`.

    text: string to scan. When omitted (None), the notebook-level `contents`
        loaded from the log file is scanned, which is the original behaviour.

    All three yielded fields are strings exactly as captured by `pat`;
    `cli` is the client id digits, not an int.
    """
    if text is None:
        text = contents
    for m in pat.finditer(text):
        name, sty, cli = m.groups()
        # Client id first so that sorting the tuples groups them by client.
        yield cli, name, sty

# Popular characters

In [28]:
def rank_chars(records=None):
    """Yield ``(cli, pop_sty)``: each client id with the style it joined with most often.

    records: optional iterable of ``(cli, name, sty)`` tuples. When omitted
        (None), the tuples produced by ``joins()`` are used, which is the
        original behaviour.
    """
    if records is None:
        records = joins()
    # groupby only merges adjacent items, so sort first to make every
    # client's records contiguous. The key indexes the tuple instead of
    # unpacking it in the lambda signature, which Python 3 no longer allows.
    for cli, group in itr.groupby(sorted(records), key=lambda rec: rec[0]):
        # most_common(1) on a non-empty group is always a single (style, count) pair.
        [(pop_sty, _count)] = col.Counter(sty for _cli, _name, sty in group).most_common(1)
        yield cli, pop_sty
# Count how many clients have each style as their most-used one and list
# up to 99 styles, most popular first.
col.Counter(
    pop_sty for _cli, pop_sty in rank_chars()
).most_common(99)

[('robot-0', 3964),
 ('alien-0', 3007),
 ('spacesuit-0', 2853),
 ('plumber-0', 2736),
 ('plain-1', 2604),
 ('robot-2', 2101),
 ('spacesuit-1', 2009),
 ('plain-0', 1961),
 ('skeleton-2', 1732),
 ('plain-2', 1702),
 ('skeleton-0', 1499),
 ('alien-2', 1332),
 ('spacesuit-2', 1237),
 ('skeleton-1', 1165),
 ('robot-1', 1155),
 ('plumber-2', 1086),
 ('plumber-1', 910),
 ('alien-1', 415),
 ('reddit-0', 199),
 ('slender-0', 42)]

# Repeat plays

In [32]:
# Number of join records per client id.
cli2nplays = col.Counter(cli for cli, name, sty in joins())
# list(...) hands the constructor concrete (cli, nplays) rows; on Python 3
# items() is a view, which older pandas releases refuse to accept.
df = pd.DataFrame(list(cli2nplays.items()), columns=['cli','nplays'])
# Summary statistics of joins per client (cell output).
df.describe()

Unnamed: 0,nplays
count,33709.0
mean,3.999852
std,6.450676
min,1.0
25%,1.0
50%,1.0
75%,4.0
max,89.0


In [36]:
# The 30 (client id, player name) pairs with the most join records.
col.Counter(
    (cli, name) for cli, name, _sty in joins()
).most_common(30)

[(('7426885', 'baymax'), 89),
 (('2094240', 'DAMIAM'), 82),
 (('10486048', 'LUKY'), 71),
 (('3329323', 'noob'), 68),
 (('742933', '...'), 68),
 (('1882289', 'Anonymous Stomper'), 67),
 (('7653037', "Don\\'t try it Anakin"), 62),
 (('4563954', '???'), 62),
 (('8032635', 'ROEI'), 59),
 (('4245337', 'vn?'), 59),
 (('10675306', 'theskullguy158'), 58),
 (('1561197', 'vegito'), 58),
 (('369703', 'Lance12'), 58),
 (('6297726', 'smasher'), 58),
 (('6373959', 'tgghgtgfggf'), 57),
 (('6180800', 'lol'), 57),
 (('12374220', "Don\\'t try it Anakin!"), 54),
 (('5181027', 'a'), 53),
 (('342902', 'uio'), 52),
 (('45511', 'lolol'), 52),
 (('913071', 'foxy'), 51),
 (('9027453', 'ATH'), 51),
 (('10783722', 'yo'), 50),
 (('6327197', 'sans'), 50),
 (('2011985', 'ytutfk'), 50),
 (('1067262', 'Tristan Man'), 50),
 (('477637', 'destroyer'), 50),
 (('6209360', 'king of north'), 50),
 (('2531840', 'fk you'), 50),
 (('6665971', 'bnb'), 50)]

# Names / Repeat Visitors

In [67]:
# Collapse the join records to distinct (client, name) pairs first, so each
# name is counted once per client that used it rather than once per join.
name_freq = col.Counter(
    name for _cli, name in set((cli, name) for cli, name, _sty in joins())
)
name_freq.most_common(30)

[(u'lol', 416),
 (u'hi', 353),
 (u'mario', 256),
 (u'Mario', 160),
 (u'a', 117),
 (u'paku.io', 108),
 (u'juan', 99),
 (u'123', 94),
 (u'asd', 91),
 (u'gg', 86),
 (u'1', 85),
 (u'bob', 84),
 (u'f', 74),
 (u'hola', 71),
 (u'deathviper79', 71),
 (u'noob', 69),
 (u'g', 67),
 (u'pop', 64),
 (u'yourself', 63),
 (u'd', 62),
 (u'Luigi', 61),
 (u'alex', 59),
 (u's', 57),
 (u'damiam', 54),
 (u'box', 54),
 (u'eryh', 53),
 (u'io', 53),
 (u'j', 53),
 (u'h', 52),
 (u'HI', 50)]

# Special name searches

In [57]:
# Fold the per-name tallies together case-insensitively (e.g. 'Mario' and
# 'mario' end up under the same lowercase key).
lower_name_freq = col.Counter()
for name, count in name_freq.items():
    lower_name_freq[name.lower()] += count
# Names to look up in the lowercase tally.
names = [
    'fady',
    'paku.io',
    'markiplier',
    'pewdiepie',
    'jacksepticeye',
    'kwebblekop',
]
# .get() rather than indexing: a name that never appears is reported as None.
[(name, lower_name_freq.get(name)) for name in names]

[('fady', 83),
 ('paku.io', 111),
 ('markiplier', 3),
 ('pewdiepie', 1),
 ('jacksepticeye', None),
 ('kwebblekop', 2)]

# Name regions

In [81]:
def try_detect(name):
    """Return the result of ``langdetect.detect(name)``, or '' if detection raises.

    Detection is best-effort: any ordinary exception from langdetect is
    mapped to the empty string so one undetectable name does not abort the
    whole scan.
    """
    # NOTE(review): langdetect's documentation describes detection as
    # non-deterministic unless DetectorFactory.seed is set; consider seeding
    # if the groupings need to be reproducible between runs.
    try:
        return langdetect.detect(name)
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate and a long scan can be interrupted.
        return ''
# Bucket every distinct player name under the value try_detect returns for
# it ('' collects the names for which detection failed).
lang2names = {}
for name in name_freq:
    lang = try_detect(name)
    if lang not in lang2names:
        lang2names[lang] = []
    lang2names[lang].append(name)

1456 en
death...from...above
The Worst
eshea
MOSHE REAPER!!!!!!!!!!!!!!!!!!!!
im.new.dont.atack.me
My name is Jeef\'s
I am the one
but did NOT stomp that guy
IT IS ME MAREO
athal

1265 cy
ytdefb
Mr.Joey
hfghfgt
dqsdysu
g4h35b4
r66
ethan-.,.269-yulio
rgdfg
pawel
gthyyy7ui

1224 de
ULTRA ZERG
kdfhnfghsdfhd
BLAHBLAHHHHHHHHHHHHHHHHHHHHHHHHH
DSFDSFDSFFD
FJHSALKGEJHSLKGJ
BEN
thunder
herb
Dick Kickem
SMDC

1001 so
daynis
oooh
o oooo
yeerk
lrrkwqwl
baymax
vaaaaaaaaaaaaaddajbhf
WARIO!
darkwood
hidalgo

795 tl
no kill
antip2010
Bayonetta
pato
gg\\
agirl
ysysgkayxuv
A
sasasa
ALEIN

783 sw
hjikig
MUAHAHAHAHAHAHAHAHAHAHAHAHAHAHAH
MSJHGFIUHFDUIJGHR
k
MIGIT
want
7uyjhjhuy
Hia
samuzao
wales

688 
(0_0)(0_0)
[
88888888888888888888888888888
9+
224522203
98
ｔ
111111111111111
1000
55546654

603 fi
luis y mami
Rafael fakkkkkkkkkkkkkkkkkkkkkkk
Trissitan
aishmarjuliethc
LittyTitties
ona_777
axtronautita Blanko
osin
sssssssssssss
JustSkill

597 id
Sugar
Seberstjan
Kman
Jumper
ILYAS
SkupaCZ
Mar.IO
BDPanda
ia

In [82]:
# One (size, language, names) tuple per detected language. The comprehension
# variable is deliberately not called `names`: on Python 2 list-comprehension
# variables leak into the enclosing scope and would overwrite the `names`
# list defined in an earlier cell.
groupings = [(len(lang_names), lang, lang_names) for lang, lang_names in lang2names.items()]
# Largest groups first, at most 100 of them. The language keys are unique, so
# the tuples never tie and a descending sort gives the same order as
# reversing an ascending one.
for a, b, cs in sorted(groupings, reverse=True)[:100]:
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('%s %s' % (a, b))
    # Show only the first 10 names of each group as a sample.
    for c in cs[:10]:
        print(c)
    print('')

1456 en
death...from...above
The Worst
eshea
MOSHE REAPER!!!!!!!!!!!!!!!!!!!!
im.new.dont.atack.me
My name is Jeef\'s
I am the one
but did NOT stomp that guy
IT IS ME MAREO
athal

1265 cy
ytdefb
Mr.Joey
hfghfgt
dqsdysu
g4h35b4
r66
ethan-.,.269-yulio
rgdfg
pawel
gthyyy7ui

1224 de
ULTRA ZERG
kdfhnfghsdfhd
BLAHBLAHHHHHHHHHHHHHHHHHHHHHHHHH
DSFDSFDSFFD
FJHSALKGEJHSLKGJ
BEN
thunder
herb
Dick Kickem
SMDC

1001 so
daynis
oooh
o oooo
yeerk
lrrkwqwl
baymax
vaaaaaaaaaaaaaddajbhf
WARIO!
darkwood
hidalgo

795 tl
no kill
antip2010
Bayonetta
pato
gg\\
agirl
ysysgkayxuv
A
sasasa
ALEIN

783 sw
hjikig
MUAHAHAHAHAHAHAHAHAHAHAHAHAHAHAH
MSJHGFIUHFDUIJGHR
k
MIGIT
want
7uyjhjhuy
Hia
samuzao
wales

688 
(0_0)(0_0)
[
88888888888888888888888888888
9+
224522203
98
ｔ
111111111111111
1000
55546654

603 fi
luis y mami
Rafael fakkkkkkkkkkkkkkkkkkkkkkk
Trissitan
aishmarjuliethc
LittyTitties
ona_777
axtronautita Blanko
osin
sssssssssssss
JustSkill

597 id
Sugar
Seberstjan
Kman
Jumper
ILYAS
SkupaCZ
Mar.IO
BDPanda
ia