In [69]:
import sys
! {sys.executable} -m pip install see

Collecting see
  Downloading see-1.4.1-py2.py3-none-any.whl (12 kB)
Installing collected packages: see
Successfully installed see-1.4.1


In [40]:
import pandas as pd
import numpy as np

In [8]:
!cat ../data/655/*.jsonl > /tmp/655.jsonl
df = pd.read_json("/tmp/655.jsonl",lines=True, convert_dates=False)
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

In [169]:
from collections import defaultdict

class PredictModel:
    min_val = 1
    max_val = 55
    number_predict = 6
    ticket_price = 10000
    
    prices = {
        6: 20000000000,
        5: 5000000000,
        4: 500000,
        3: 50000
    }

    col_date = 'date'
    col_result = 'result'
    col_predict = 'predicted'
    col_predict_time = 'predict_time'
    col_predict_metadata = 'predict_metadata'
    col_correct = 'is_correct'
    col_correct_num = 'correct_num'

    def __init__(self, df, time_predict=1):
        self.df = df
        self.df_backtest = None
        self.df_backtest_explode = None
        self.df_backtest_evaluate = None
        self.time_predict = time_predict

    @classmethod
    def _count_number(cls, number_series):
        return number_series.explode().value_counts().to_frame('times')

    @classmethod
    def _compare_list(cls, l1, l2):
        l1_s = set(l1)
        l2_s = set(l2)
        inter = l1_s.intersection(l2_s)
        return len(inter) == len(l1), len(inter)

    def predict(self, date):
        pass

    def backtest(self):
        _df = self.df.copy()

        def fn_apply(row):
            predicted = []
            for i in range(self.time_predict):
                loop_predict = self.predict(row.date)
                correct, correct_num = self._compare_list(row.result, loop_predict)
                predicted.append({
                    PredictModel.col_predict + '_idx': i,
                    PredictModel.col_predict: loop_predict,
                    PredictModel.col_correct: correct,
                    PredictModel.col_correct_num: correct_num,
                })
                
            return predicted
        
        _df['predict_metadata'] = _df.apply(fn_apply, axis=1)
        self.df_backtest = _df

    def evaluate(self):
        self.df_backtest_explode = self.df_backtest.explode(PredictModel.col_predict_metadata)
        self.df_backtest_evaluate = pd.concat([
            self.df_backtest_explode.reset_index(drop=True), 
            pd.json_normalize(self.df_backtest_explode[PredictModel.col_predict_metadata])], axis='columns')

        print(f"correct time: {self.df_backtest_evaluate[PredictModel.col_correct].sum()}")
        print(f"count correct num: {self.df_backtest_evaluate.value_counts(PredictModel.col_correct_num)}")

    def revenue(self):
        cost = len(self.df_backtest_evaluate) * self.ticket_price
        gain = self.df_backtest_evaluate[PredictModel.col_correct_num].apply(lambda v: self.prices.get(v, 0)).sum()

        return cost, gain, gain - cost

class RandomModel(PredictModel):
    def predict(self, *args, **kwargs):
        import random
        nums=list(range(PredictModel.min_val, PredictModel.max_val))
        random.shuffle(nums)

        return nums[:PredictModel.number_predict]

In [177]:
total_cost = 0
total_gain = 0
for idx in range(10):
    model_random = RandomModel(df, time_predict=10)
    model_random.backtest()
    model_random.evaluate()
    cost, gain, revenue = model_random.revenue()

    total_cost += cost
    total_gain += gain

correct time: 0
count correct num: correct_num
0    2997
1    2942
2    1016
3     133
4      12
dtype: int64
correct time: 0
count correct num: correct_num
0    2991
1    2923
2    1021
3     156
4       9
dtype: int64
correct time: 0
count correct num: correct_num
0    3019
1    2928
2     995
3     153
4       5
dtype: int64
correct time: 0
count correct num: correct_num
0    3019
1    2925
2     991
3     155
4      10
dtype: int64
correct time: 0
count correct num: correct_num
0    3011
1    2844
2    1088
3     144
4      12
5       1
dtype: int64
correct time: 0
count correct num: correct_num
1    2982
0    2976
2     995
3     143
4       4
dtype: int64
correct time: 0
count correct num: correct_num
1    2967
0    2948
2    1021
3     152
4      12
dtype: int64
correct time: 0
count correct num: correct_num
0    2978
1    2925
2    1037
3     149
4      11
dtype: int64
correct time: 0
count correct num: correct_num
0    3028
1    2952
2     975
3     136
4       9
dtype: int64


In [183]:
print(f"{total_cost:,}, {total_gain:,}")
total_gain - total_cost

710,000,000, 5,122,350,000


4412350000

In [166]:
pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata])

Unnamed: 0,predicted_idx,predicted,is_correct,correct_num
0,0,"[37, 51, 14, 47, 12, 52]",False,1
1,1,"[25, 14, 38, 15, 11, 7]",False,2
2,2,"[22, 46, 33, 30, 24, 53]",False,1
3,3,"[1, 49, 48, 33, 50, 42]",False,0
4,4,"[4, 15, 5, 19, 17, 2]",False,1
...,...,...,...,...
7095,5,"[7, 9, 40, 31, 54, 43]",False,1
7096,6,"[40, 48, 53, 37, 10, 7]",False,1
7097,7,"[51, 15, 43, 20, 34, 24]",False,1
7098,8,"[24, 47, 29, 25, 41, 44]",False,0


In [168]:
pd.concat([
    model_random.df_backtest_explode.reset_index(drop=True),
    pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata])
], axis='columns')


Unnamed: 0,date,id,result,page,predict_metadata,predicted_idx,predicted,is_correct,correct_num
0,2017-08-01,1,"[5, 10, 14, 23, 24, 35, 38]",59,"{'predicted_idx': 0, 'predicted': [37, 51, 14,...",0,"[37, 51, 14, 47, 12, 52]",False,1
1,2017-08-01,1,"[5, 10, 14, 23, 24, 35, 38]",59,"{'predicted_idx': 1, 'predicted': [25, 14, 38,...",1,"[25, 14, 38, 15, 11, 7]",False,2
2,2017-08-01,1,"[5, 10, 14, 23, 24, 35, 38]",59,"{'predicted_idx': 2, 'predicted': [22, 46, 33,...",2,"[22, 46, 33, 30, 24, 53]",False,1
3,2017-08-01,1,"[5, 10, 14, 23, 24, 35, 38]",59,"{'predicted_idx': 3, 'predicted': [1, 49, 48, ...",3,"[1, 49, 48, 33, 50, 42]",False,0
4,2017-08-01,1,"[5, 10, 14, 23, 24, 35, 38]",59,"{'predicted_idx': 4, 'predicted': [4, 15, 5, 1...",4,"[4, 15, 5, 19, 17, 2]",False,1
...,...,...,...,...,...,...,...,...,...
7095,2022-04-14,710,"[1, 5, 9, 34, 37, 45, 52]",0,"{'predicted_idx': 5, 'predicted': [7, 9, 40, 3...",5,"[7, 9, 40, 31, 54, 43]",False,1
7096,2022-04-14,710,"[1, 5, 9, 34, 37, 45, 52]",0,"{'predicted_idx': 6, 'predicted': [40, 48, 53,...",6,"[40, 48, 53, 37, 10, 7]",False,1
7097,2022-04-14,710,"[1, 5, 9, 34, 37, 45, 52]",0,"{'predicted_idx': 7, 'predicted': [51, 15, 43,...",7,"[51, 15, 43, 20, 34, 24]",False,1
7098,2022-04-14,710,"[1, 5, 9, 34, 37, 45, 52]",0,"{'predicted_idx': 8, 'predicted': [24, 47, 29,...",8,"[24, 47, 29, 25, 41, 44]",False,0


In [158]:
model_random.df_backtest.explode(PredictModel.col_predict_metadata)
pd.concat([model_random.df_backtest_explode, 
            pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata])], axis='columns')

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

# Lession

Don't gambling, haha