In [7]:
import pandas as pd

In [15]:
df = pd.read_json("../data/power655.jsonl", lines=True)

In [47]:
class PredictModel:
    min_val = 1
    max_val = 55
    number_predict = 6
    ticket_price = 10000

    prices = {6: 40_000_000_000, 5: 5_000_000_000, 4: 500000, 3: 50000}

    col_date = "date"
    col_result = "result"
    col_predict = "predicted"
    col_predict_time = "predict_time"
    col_predict_metadata = "predict_metadata"
    col_correct = "is_correct"
    col_correct_num = "correct_num"

    def __init__(self, df, time_predict=1):
        self.df = df
        self.df_backtest = None
        self.df_backtest_explode = None
        self.df_backtest_evaluate = None
        self.time_predict = time_predict

    @classmethod
    def _count_number(cls, number_series):
        return number_series.explode().value_counts().to_frame("times")

    @classmethod
    def _compare_list(cls, l1, l2):
        l1_s = set(l1)
        l2_s = set(l2)
        inter = l1_s.intersection(l2_s)
        return len(inter) == len(l1), len(inter)

    def predict(self, date):
        pass

    def backtest(self):
        _df = self.df.copy()

        def fn_apply(row):
            predicted = []
            for i in range(self.time_predict):
                loop_predict = self.predict(row.date)
                correct, correct_num = self._compare_list(row.result, loop_predict)
                predicted.append(
                    {
                        PredictModel.col_predict + "_idx": i,
                        PredictModel.col_predict: loop_predict,
                        PredictModel.col_correct: correct,
                        PredictModel.col_correct_num: correct_num,
                    }
                )

            return predicted

        _df["predict_metadata"] = _df.apply(fn_apply, axis=1)
        self.df_backtest = _df

    def evaluate(self):
        self.df_backtest_explode = self.df_backtest.explode(PredictModel.col_predict_metadata)
        self.df_backtest_evaluate = pd.concat(
            [
                self.df_backtest_explode.reset_index(drop=True),
                pd.json_normalize(self.df_backtest_explode[PredictModel.col_predict_metadata]),
            ],
            axis="columns",
        )

        print(f"correct time: {self.df_backtest_evaluate[PredictModel.col_correct].sum()}")
        print(f"count correct num: {self.df_backtest_evaluate.value_counts(PredictModel.col_correct_num)}")
        print(f"{self.df_backtest_evaluate[self.df_backtest_evaluate['correct_num'] >= 5].to_markdown()}")

    def revenue(self):
        cost = len(self.df_backtest_evaluate) * self.ticket_price
        gain = self.df_backtest_evaluate[PredictModel.col_correct_num].apply(lambda v: self.prices.get(v, 0)).sum()

        return cost, gain, gain - cost


class RandomModel(PredictModel):
    def predict(self, *args, **kwargs):
        import random

        nums = list(range(PredictModel.min_val, PredictModel.max_val))
        random.shuffle(nums)

        return nums[: PredictModel.number_predict]

In [57]:
total_cost = 0
total_gain = 0
for idx in range(10):
    model_random = RandomModel(df, time_predict=10)
    print(f"using model={type(model_random).__name__}")
    model_random.backtest()
    model_random.evaluate()
    cost, gain, revenue = model_random.revenue()
    print(cost, gain, revenue)

    total_cost += cost
    total_gain += gain

print(f"{total_cost:,}, {total_gain:,}")
total_gain - total_cost

using model=RandomModel
correct time: 0
count correct num: correct_num
0    3881
1    3848
2    1293
3     186
4      12
Name: count, dtype: int64
| date   | id   | result   | page   | process_time   | predict_metadata   | predicted_idx   | predicted   | is_correct   | correct_num   |
|--------|------|----------|--------|----------------|--------------------|-----------------|-------------|--------------|---------------|
92200000 15300000 -76900000
using model=RandomModel
correct time: 0
count correct num: correct_num
0    3959
1    3771
2    1298
3     175
4      17
Name: count, dtype: int64
| date   | id   | result   | page   | process_time   | predict_metadata   | predicted_idx   | predicted   | is_correct   | correct_num   |
|--------|------|----------|--------|----------------|--------------------|-----------------|-------------|--------------|---------------|
92200000 17250000 -74950000
using model=RandomModel
correct time: 0
count correct num: correct_num
1    3888
0    3813
2  

29232150000

In [23]:
s

922,000,000, 25,178,300,000


24256300000

In [19]:
pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata])

Unnamed: 0,predicted_idx,predicted,is_correct,correct_num
0,0,"[50, 41, 39, 28, 1, 45]",False,0
1,1,"[34, 20, 40, 54, 13, 41]",False,0
2,2,"[30, 34, 32, 33, 10, 4]",False,1
3,3,"[52, 48, 1, 23, 38, 6]",False,2
4,4,"[38, 45, 47, 28, 21, 49]",False,1
...,...,...,...,...
9215,5,"[41, 3, 20, 43, 33, 50]",False,1
9216,6,"[21, 53, 25, 22, 8, 40]",False,3
9217,7,"[38, 54, 11, 46, 6, 19]",False,1
9218,8,"[18, 44, 19, 38, 8, 31]",False,1


In [20]:
pd.concat(
    [
        model_random.df_backtest_explode.reset_index(drop=True),
        pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata]),
    ],
    axis="columns",
)

Unnamed: 0,date,id,result,page,process_time,predict_metadata,predicted_idx,predicted,is_correct,correct_num
0,2017-08-01,1,"[5, 10, 14, 23, 24, 38, 35]",59,2022-05-07 07:56:43.143266,"{'predicted_idx': 0, 'predicted': [50, 41, 39,...",0,"[50, 41, 39, 28, 1, 45]",False,0
1,2017-08-01,1,"[5, 10, 14, 23, 24, 38, 35]",59,2022-05-07 07:56:43.143266,"{'predicted_idx': 1, 'predicted': [34, 20, 40,...",1,"[34, 20, 40, 54, 13, 41]",False,0
2,2017-08-01,1,"[5, 10, 14, 23, 24, 38, 35]",59,2022-05-07 07:56:43.143266,"{'predicted_idx': 2, 'predicted': [30, 34, 32,...",2,"[30, 34, 32, 33, 10, 4]",False,1
3,2017-08-01,1,"[5, 10, 14, 23, 24, 38, 35]",59,2022-05-07 07:56:43.143266,"{'predicted_idx': 3, 'predicted': [52, 48, 1, ...",3,"[52, 48, 1, 23, 38, 6]",False,2
4,2017-08-01,1,"[5, 10, 14, 23, 24, 38, 35]",59,2022-05-07 07:56:43.143266,"{'predicted_idx': 4, 'predicted': [38, 45, 47,...",4,"[38, 45, 47, 28, 21, 49]",False,1
...,...,...,...,...,...,...,...,...,...,...
9215,2023-08-29,924,"[1, 8, 20, 25, 35, 53, 54]",0,2023-08-31 01:50:31.538585,"{'predicted_idx': 5, 'predicted': [41, 3, 20, ...",5,"[41, 3, 20, 43, 33, 50]",False,1
9216,2023-08-29,924,"[1, 8, 20, 25, 35, 53, 54]",0,2023-08-31 01:50:31.538585,"{'predicted_idx': 6, 'predicted': [21, 53, 25,...",6,"[21, 53, 25, 22, 8, 40]",False,3
9217,2023-08-29,924,"[1, 8, 20, 25, 35, 53, 54]",0,2023-08-31 01:50:31.538585,"{'predicted_idx': 7, 'predicted': [38, 54, 11,...",7,"[38, 54, 11, 46, 6, 19]",False,1
9218,2023-08-29,924,"[1, 8, 20, 25, 35, 53, 54]",0,2023-08-31 01:50:31.538585,"{'predicted_idx': 8, 'predicted': [18, 44, 19,...",8,"[18, 44, 19, 38, 8, 31]",False,1


In [21]:
model_random.df_backtest.explode(PredictModel.col_predict_metadata)
pd.concat(
    [
        model_random.df_backtest_explode,
        pd.json_normalize(model_random.df_backtest_explode[PredictModel.col_predict_metadata]),
    ],
    axis="columns",
)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

# Lession

Don't gambling, haha