In [44]:
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import json
import s3fs

In [120]:
def get_data():
    fs = s3fs.S3FileSystem(anon=True)
    data = []
    with fs.open('s3://perfect.timing/grong-runs.almost-json', 'rb') as f:
        for line in f:
            data.append(json.loads(line))
    return data

In [131]:
data = get_data()

In [121]:
# XjpfnxGVvaM3CZgb.json
[
    {'fight_id': 2, 'predictions': [[30, .53], [60, .32], [90, 0.25]]}
]

[{'fight_id': 2, 'predictions': [[30, 0.53], [60, 0.32], [90, 0.25]]}]

In [132]:
def only_grong(training_data, v, raid_id):
    if v and v['boss'] == 2263 and v['periods']:
        for ps in v['periods']:
            period = ps['ts-period']
            ps['raid_id'] = raid_id
            ps['kill'] = v['kill']
            training_data.setdefault(period, [])
            training_data[period].append(ps)

In [133]:
def data_transform(data):
    training_data = {}
    single_case = {}
    for d in data:
        if d:
            for raid_id, v in d.items():
                if not raid_id.startswith('xqMN1ArjGvpXDLcz'):
                    only_grong(training_data, v, raid_id)
                else:
                    only_grong(single_case, v, raid_id)
    return training_data, single_case

In [134]:
training_data, single_case = data_transform(data)

In [None]:
# xqMN1ArjGvpXDLcz.json
{10: [[30, 0.4864382751041898],
  [60, 0.5156320225239619],
  [90, 0.004746332097930662],
  [120, 0.0005584344248507168]],
 8: [[30, 0.4740539229317621],
  [60, 0.4174048033537797],
  [90, 0.2566418667630769],
  [120, 0.29307077931972425],
  [150, 0.6438731477826377],
  [180, 0.025086994655948467]],
 13: [[30, 0.42920330117213873],
  [60, 0.4293914636474287],
  [90, 0.5954389490222433],
  [120, 0.6185105349379931],
  [150, 0.6671047712171181],
  [180, 0.7217274821561459],
  [210, 0.012486066360858597],
  [240, 0.0016096308934100629]],
 9: [[30, 0.37854993554880256],
  [60, 0.4492429686694587],
  [90, 0.688804702341058],
  [120, 0.6940732047205519],
  [150, 0.7255504571795206],
  [180, 0.6882051889868604],
  [210, 0.006059650525518832],
  [240, 0.0025490794310735984]],
 11: [[30, 0.4598978701418732],
  [60, 0.5111605529336872],
  [90, 0.5536743161115566],
  [120, 0.5673377934967901],
  [150, 0.2956018606911454],
  [180, 0.003293664442476165]],
 14: [[30, 0.41670080000910364],
  [60, 0.45378689943937617],
  [90, 0.4799345253478957],
  [120, 0.48104525316241276],
  [150, 0.42925826680638346],
  [180, 0.5769215751184661],
  [210, 0.8249744373870858],
  [240, 0.9173063857087477],
  [270, 0.9479688814866076],
  [300, 0.9941063420793372]],
 12: [[30, 0.3793245176157243],
  [60, 0.3347207658043423],
  [90, 0.024994679192818992],
  [120, 0.002798971565707081]]}

In [138]:
def train_test_score(training_data, single_case):
    scores = {}
    for p, v in training_data.items():
        print(f'Running period: {p}')
        df = pd.DataFrame.from_records(v)
        x_train, x_test, y_train, y_test = train_test_split(
            df.drop(columns=['kill', 'raid_id', 'period']),
            df.kill,
            stratify=df.kill)
        lr = LogisticRegression()
        try:
            lr.fit(x_train, y_train)
            records = single_case.get(p)
            if records:
                sdf = pd.DataFrame.from_records(records)
                y_scores = [pr[1] for pr in lr.predict_proba(sdf.drop(columns=['kill', 'raid_id', 'period']))]
                fight_tuples = list(zip(sdf.raid_id, y_scores))
                for fid, prob in fight_tuples:
                    fight_id = int(fid.split('-')[1])
                    scores.setdefault(fight_id, [])
                    scores[fight_id].append([p, prob])
        except ValueError as ve:
            print(f'Received {ve}')
    return scores

In [140]:
train_test_score(training_data, single_case)

Running period: 30
Running period: 60
Running period: 90
Running period: 120
Running period: 150
Running period: 180
Running period: 210




Running period: 240
Running period: 270
Running period: 300
Running period: 330
Running period: 360
Running period: 390
Running period: 420
Received This solver needs samples of at least 2 classes in the data, but the data contains only one class: False




{10: [[30, 0.4864382751041898],
  [60, 0.5156320225239619],
  [90, 0.004746332097930662],
  [120, 0.0005584344248507168]],
 8: [[30, 0.4740539229317621],
  [60, 0.4174048033537797],
  [90, 0.2566418667630769],
  [120, 0.29307077931972425],
  [150, 0.6438731477826377],
  [180, 0.025086994655948467]],
 13: [[30, 0.42920330117213873],
  [60, 0.4293914636474287],
  [90, 0.5954389490222433],
  [120, 0.6185105349379931],
  [150, 0.6671047712171181],
  [180, 0.7217274821561459],
  [210, 0.012486066360858597],
  [240, 0.0016096308934100629]],
 9: [[30, 0.37854993554880256],
  [60, 0.4492429686694587],
  [90, 0.688804702341058],
  [120, 0.6940732047205519],
  [150, 0.7255504571795206],
  [180, 0.6882051889868604],
  [210, 0.006059650525518832],
  [240, 0.0025490794310735984]],
 11: [[30, 0.4598978701418732],
  [60, 0.5111605529336872],
  [90, 0.5536743161115566],
  [120, 0.5673377934967901],
  [150, 0.2956018606911454],
  [180, 0.003293664442476165]],
 14: [[30, 0.41670080000910364],
  [60, 0.4