In [1]:
import pandas as pd
from random import randint, choice
from scipy.stats import binom
from numpy.random import choice
from tqdm.notebook import tqdm

In [2]:
dt = pd.read_csv('forecast2.csv', sep=';')
# fishs = ['щука', 'судак', 'окунь', 'берш', 'речная форель', 'озерная форель', 'елец', 'чехонь', 'сом', 'голавль', 'язь',
#          'карп', 'жерех', 'лещ', 'карась', 'линь', 'пескарь', 'ротан', 'плотва', 'красноперка', 'налим', 'густера',
#          'амур', 'ерш', 'сазан', 'подуст', 'толстолобик', 'вобла', 'хариус']
# dt = dt.rename(columns={fish: fish.capitalize() for fish in fishs})
dt

Unnamed: 0,day_temp,day_pressure,day_obl,day_phen,day_dir,day_wind,areal,city,year,month,...,Красноперка,Налим,Густера,Амур,Ерш,Сазан,Подуст,Толстолобик,Вобла,Хариус
0,-4,749,dull,snow,Ю,3,Алтайский край,Барнаул,2020,1,...,0,1,0,0,0,0,0,0,0,0
1,-1,750,dull,-,Ю,5,Алтайский край,Барнаул,2020,1,...,0,1,0,0,0,0,0,0,0,0
2,-3,749,dull,-,Ю,2,Алтайский край,Барнаул,2020,1,...,0,1,0,0,1,0,0,0,0,0
3,-6,753,dull,-,Ю,1,Алтайский край,Барнаул,2020,1,...,0,1,0,0,0,0,0,0,0,0
4,-2,752,suncl,-,Ю,3,Алтайский край,Барнаул,2020,1,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30516,-6,752,dull,-,З,1,Московская область,Щелково,2020,12,...,0,0,0,0,1,0,0,0,0,0
30517,-6,759,sunc,-,ЮВ,1,Московская область,Щелково,2020,12,...,0,1,0,0,0,0,0,0,0,0
30518,-5,754,dull,-,ЮВ,3,Московская область,Щелково,2020,12,...,0,0,0,0,1,0,0,0,0,0
30519,-3,753,dull,-,-,0,Московская область,Щелково,2020,12,...,0,0,0,0,1,0,0,0,0,0


In [3]:
class DayGenerator:
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.times = [12, 15, 18, 21, 0, 3, 6, 9]
        self.dirs = {
            'С': {'values': ['СЗ', 'СВ'], 'p': [0.2, 0.3]},
            'СЗ': {'values': ['С', 'З'], 'p': [0.25, 0.25]},
            'З': {'values': ['СЗ', 'ЮЗ'], 'p': [0.2, 0.3]},
            'ЮЗ': {'values': ['З', 'Ю'], 'p': [0.25, 0.25]},
            'Ю': {'values': ['ЮЗ', 'ЮВ'], 'p': [0.3, 0.2]},
            'ЮВ': {'values': ['Ю', 'В'], 'p': [0.2, 0.3]},
            'В': {'values': ['СВ', 'ЮВ'], 'p': [0.3, 0.2]},
            'СВ': {'values': ['С', 'В'], 'p': [0.3, 0.2]},
        }
        
        self.phens = {
            'snow': {'values': ['небольшой снег', 'снег', 'снег с дождём', 'сильный снег', 'мокрый снег'], 'p': [0.35, 0.25, 0.15, 0.1, 0.15]},
            'rain': {'values': ['небольшой дождь', 'дождь', 'сильный дождь'], 'p': [0.45, 0.35, 0.2]},
            'storm': {'values': ['небольшой дождь', 'дождь', 'гроза', 'сильный дождь'], 'p': [0.15, 0.25, 0.3, 0.3]},
        }
        
        self.obl = {
            'dull': {'values': ['пасмурно', 'облачно'], 'p': [0.7, 0.3]},
            'suncl': {'values': ['облачно', 'малооблачно', 'пасмурно'], 'p': [0.6, 0.3, 0.1]},
            'sun': {'values': ['ясно', 'малооблачно'], 'p': [0.7, 0.3]},
            'sunc': {'values': ['малооблачно', 'ясно', 'облачно'], 'p': [0.6, 0.2, 0.2]},
        }
        
        self.fishs = ['Щука', 'Судак', 'Окунь', 'Берш', 'Речная форель', 'Озерная форель', 'Елец', 'Чехонь', 'Сом', 'Голавль', 'Язь',
         'Карп', 'Жерех', 'Лещ', 'Карась', 'Линь', 'Пескарь', 'Ротан', 'Плотва', 'Красноперка', 'Налим', 'Густера',
         'Амур', 'Ерш', 'Сазан', 'Подуст', 'Толстолобик', 'Вобла', 'Хариус']
        
    def __getitem__(self, idx):
        day_priew = self.dataframe.loc[[idx - 1]]
        day = self.dataframe.loc[[idx]]
        day_next = self.dataframe.loc[[idx + 1]]
        pressure = ','.join(map(str, self.gen_pressure(day_priew['day_pressure'].item(), day['day_pressure'].item(), day_next['day_pressure'].item())))
        temperature = ','.join(map(str, self.gen_temperature(day_priew['day_temp'].item(), day['day_temp'].item(), day_next['day_temp'].item())))
        wind_, gust_ = self.gen_wind(day_priew['day_wind'].item(), day['day_wind'].item(), day_next['day_wind'].item())
        wind = ','.join(map(str, wind_))
        gust = ','.join(map(str, gust_))
        wind_direction = ','.join(self.gen_dir(day['day_dir'].item()))
        phenomen_= self.gen_phenomen(day['day_obl'].item(), day['day_phen'].item())
        humidity = ','.join(map(str,self.gen_hum(phenomen_)))
        phenomen = ','.join(phenomen_)
        uv_index = ','.join(map(str,self.gen_uv(day['month'].item())))
        moon_direction, moon = self.gen_moon(sum(day[fish].item() for fish in self.fishs) / len(self.fishs))
        return {
            'pressure': pressure,
            'temperature': temperature,
            'wind': wind,
            'gust': gust,
            'wind_direction': wind_direction,
            'humidity': humidity,
            'phenomenon': phenomen,
            'uv_index': uv_index,
            'moon_direction': moon_direction,
            'moon': moon,
            'month': day['month'].item(),
            'day': day['day'].item(),
            
        }
        
    def dist_(self, left, right, len_):
        is_revert = False
        if left > right:
            left, right = right, left
            is_revert = True
        result = []
        current = left
        for _ in range(len_ - 1): 
            sub = binom.rvs(right - current, 1 / len_)
            current += sub
            result.append(current)
        result.append(right)
        return result[::-1] if is_revert else result
        
    def gen_pressure(self, pressure_priew, pressure, pressure_next):
        pressure_ = {}
        priew_sub = pressure - pressure_priew
        next_sub = pressure_next - pressure
        current_pressure = pressure_priew
        for time in [15, 18, 21]:
            sub = randint(-2, 0) if priew_sub < 0 else randint(-1, 1)
            current_pressure += sub
        dist = self.dist_(current_pressure, pressure, 5)
        for idx, time in enumerate([0, 3, 6, 9, 12]):
            pressure_.update({time: dist[idx]})
        current_pressure = pressure
        for time in [15, 18, 21]:
            sub = randint(-2, 0) if next_sub < 0 else randint(-1, 1)
            current_pressure += sub
            pressure_.update({time: current_pressure})
        return [pressure_[key] for key in sorted(pressure_)]
    
    def gen_wind(self, wind_priew, wind, wind_next):
        wind_ = {}
        priew_sub = wind - wind_priew
        next_sub = wind_next - wind
        current_wind = randint(0, randint(wind_priew, wind)) if priew_sub > 0 else 0
        dist = self.dist_(current_wind, wind, 5)
        for idx, time in enumerate([0, 3, 6, 9, 12]):
            wind_.update({time: dist[idx]})
        current_wind = wind_[12] + randint(-1, 2)
        current_wind = max(0, current_wind)
        wind_.update({15: current_wind})
        current_wind += randint(-1, 2)
        current_wind = max(0, current_wind)
        wind_.update({18: current_wind})
        current_wind += randint(-3, 0)
        current_wind = max(0, current_wind)
        wind_.update({21: current_wind})
        gust_= {time: wind_[time] + randint(3, 8) for time in self.times}
        return [wind_[key] for key in sorted(wind_)], [gust_[key] for key in sorted(gust_)]
    
    def gen_dir(self,direction):
        dir_ = {}
        if direction == '-':
            direction = choice(list(self.dirs.keys()))
        dist = list(choice([direction] + self.dirs[direction]['values'], 8, p=[0.5] + self.dirs[direction]['p']))
        for idx, time in enumerate(self.times):
            dir_.update({time: dist[idx]})
        return [dir_[key] for key in sorted(dir_)]
    
    def gen_temperature(self, temp_priew, temp, temp_next):
        temp_ = {}
        priew_sub = temp - temp_priew
        next_sub = temp_next - temp
        current_temp = temp_priew
        for time in [15, 18, 21]:
            sub = randint(-3, -1) if priew_sub < 0 else randint(-2, 0)
            current_temp += sub
        dist = self.dist_(current_temp, temp, 5)
        for idx, time in enumerate([0, 3, 6, 9, 12]):
            temp_.update({time: dist[idx]})
        current_temp = temp
        for time in [15, 18, 21]:
            sub = randint(-3, -1) if next_sub < 0 else randint(-2, 0)
            current_temp += sub
            temp_.update({time: current_temp})
        return [temp_[key] for key in sorted(temp_)]
    
    def gen_phenomen(self, obl, phen):
        phens_ = {}
        if obl == '-':
            obl = 'sun'
        dist_olb = list(choice(self.obl[obl]['values'], 8, p=self.obl[obl]['p']))
        if phen == '-':
            for idx, time in enumerate(self.times):
                phens_.update({time: dist_olb[idx]})
            return [phens_[key] for key in sorted(phens_)] 
        dist_phens = list(choice(self.phens[phen]['values'], 8, p=self.phens[phen]['p']))
        for idx, time in enumerate(self.times):
            phens_.update({time: '.'.join((dist_olb[idx], dist_phens[idx]))})
        return [phens_[key] for key in sorted(phens_)]
    
    def gen_hum(self, phens):
        hum_ = []
        current_hum = randint(30, 70)
        for phen in phens:
            if 'дождь' in phen:
                current_hum += randint(0, 20)
                current_hum = min(current_hum, randint(93, 99))
            else:
                current_hum += randint(-10, 5)
                current_hum = max(current_hum, randint(10, 29))
            hum_.append(current_hum)
        return hum_
    
    def gen_uv(self, month):
        uv_ = {}
        if month in [12, 1, 2]:
            uv_ = {0: 0, 3: 0, 6: 0, 9: 0, 12: randint(1, 2), 15: 1, 18: 0, 21:0} 
        elif month in [3, 11, 10, 4]:
            uv_ = {0: 0, 3: 0, 6: 0, 9: 1, 12: randint(2, 3), 15: randint(1, 2), 18: 1, 21: 0}
        elif month in [5, 9]:
            uv_ = {0: 0, 3: 0, 6: randint(0, 1), 9: randint(1, 2), 12: randint(2, 4), 15: randint(2, 3), 18: randint(1, 2), 21: randint(0, 1)}
        else:
            uv_ = {0: 0, 3: 0, 6: randint(1, 2), 9: randint(1, 3), 12: randint(2, 4), 15: randint(1, 3), 18: randint(1, 3), 21: randint(1, 2)}
        return [uv_[key] for key in sorted(uv_)]
    
    def gen_moon(self, forecast):
        if forecast > 0.5:
            return 2* randint(0, 1) - 1, randint(35, 70)
        else:
            if randint(0, 1):
                return 2* randint(0, 1) - 1, randint(0, 35)
            else:
                return 2* randint(0, 1) - 1, randint(70, 99)   

In [4]:
ALONE_KEYS = {'time', 'day', 'month', 'humidity', 'uv_index', 'moon', 'moon_direction'}
DIGIT_KEYS = {'temperature', 'wind', 'gust', 'pressure', 'humidity', 'uv_index'}
CATEGORY_KEYS = {'phenomenon', 'wind_direction'}
MOON_KEYS = {'moon', 'moon_direction'}
SUN_KEYS = {'sun_up', 'sun_down'}
WIND_DIRECTIONS = ['Ю', 'ЮЗ', 'З', 'СЗ', 'С', 'СВ', 'В', 'ЮВ']
PHENOMENONS = ['ясно', 'малооблачно', 'облачно', 'пасмурно', 'небольшой дождь', 'дождь', 'сильный дождь',
               'небольшой снег', 'снег', 'снег с дождём', 'сильный снег', 'гроза', 'мокрый снег']
REGIONS = ['Алтайский край', 'Амурская область', 'Архангельская область', 'Астраханская область',
           'Белгородская область', 'Брянская область', 'Владимирская область', 'Волгоградская область',
           'Вологодская область', 'Воронежская область', 'Еврейская автономная область', 'Забайкальский край',
           'Ивановская область', 'Иркутская область', 'Кабардино-Балкарская республика', 'Калининградская область',
           'Калужская область', 'Камчатский край', 'Карачаево-Черкесская республика', 'Кемеровская область',
           'Кировская область', 'Костромская область', 'Краснодарский край', 'Красноярский край', 'Курганская область',
           'Курская область', 'Ленинградская область', 'Липецкая область', 'Магаданская область', 'Московская область']


num_hours = 8
num_days = 3

def preprocess_(data):
    all_data = {}
    for d in data:
        for key in DIGIT_KEYS:
            temp = list(map(int, d[key].split(',')))
            if key in all_data:
                all_data[key] = all_data[key] + temp
            else:
                all_data[key] = temp
        for key in MOON_KEYS:
            temp = [d[key] for _ in range(num_hours)]
            if key in all_data:
                all_data[key] = all_data[key] + temp
            else:
                all_data[key] = temp
        for key in CATEGORY_KEYS:
            if key in all_data:
                all_data[key] = all_data[key] + d[key].split(',')
            else:
                all_data[key] = d[key].split(',')
        days = [d['day'] for _ in range(num_hours)]
        months = [d['month'] for _ in range(num_hours)]
        if 'day' in all_data:
            all_data['day'] = all_data['day'] + days
        else:
            all_data['day'] = days
        if 'month' in all_data:
            all_data['month'] = all_data['month'] + months
        else:
            all_data['month'] = months
        if 'time' in all_data:
            all_data['time'] = all_data['time'] + list(range(0, num_hours * 3, 3))
        else:
            all_data['time'] = list(range(0, num_hours * 3, 3))
    return all_data

def slice_(data, left_bound, righ_bound):
    slice_data = {
        key: data[key][left_bound: righ_bound] for key in data
    }
    return slice_data

def preprocess_batch_(data):
    vec = {}
    for key in data:
        if key in DIGIT_KEYS and not key in ALONE_KEYS:
            for i in range(len(data[key])):
                key_name = '{}_{}'.format(key, i)
                vec.update({key_name: data[key][i]})
        elif key == 'phenomenon':
            phenomenons_ = [_.split('.') for _ in data[key]]
            for phenomenon in PHENOMENONS:
                for i in range(len(phenomenons_)):
                    key_name = '{}_{}'.format(phenomenon, i)
                    vec.update({key_name: int(phenomenon in phenomenons_[i])})
        elif key == 'wind_direction':
            for wind_direction in WIND_DIRECTIONS:
                for i in range(len(data[key])):
                    key_name = '{}_{}'.format(wind_direction, i)
                    vec.update({key_name: int(wind_direction == data[key][i])})
        elif key == 'month':
            for month in range(1, 13):
                key_name = 'month_{}'.format(month)
                vec.update({key_name: int(month == data[key][-1])})
        elif key == 'time':
            for time in range(0, 24, 3):
                key_name = 'time_{}'.format(time)
                vec.update({key_name: int(time == data[key][-1])})
        elif key == 'moon_direction':
            for moon_direction in [-1, 1]:
                key_name = 'moon_direction_{}'.format(moon_direction)
                vec.update({key_name: int(moon_direction == data[key][-1])})
        elif key in ALONE_KEYS:
            key_name = '{}'.format(key)
            vec.update({key_name: data[key][-1]})
    return vec

def gen_forecast(forecast, time, fish):
    if fish in ['Сом', 'Налим']:
        if time == 21:
            return int(forecast) | (randint(1, 100) <= 20)
        elif time in [0, 3]:
            return int(forecast) | (randint(1, 100) <= 30)
        elif time == 6:
            return int(forecast) | (randint(1, 100) <= 20)
        else:
            return int(forecast) * (randint(1, 100) <= 40)
    else:
        if time in [6, 18]:
            return int(forecast) | (randint(1, 100) <= 40)
        elif time == 9:
            return int(forecast) | (randint(1, 100) <= 30)
        elif time == 12:
            return int(forecast) | (randint(1, 100) <= 10)
        elif time == 15:
            return int(forecast)
        else:
            return int(forecast) * (randint(1, 100) <= 30)
            

In [5]:
gen = DayGenerator(dt)

In [6]:
train_data = []
for idx, row in tqdm(dt.iterrows()):
    if idx <= 3:
        continue
    if row['Щука'] > -1:
        data = dt[idx - 3: idx + 1]
        day = gen[idx]
        day_1 = gen[idx - 1]
        day_2 = gen[idx - 2]
        day_3 = gen[idx - 3]
        all_data = preprocess_([day_3, day_2, day_1, day])
#         print(all_data)
#         break
        len_data = len(all_data['moon'])
        probs = {fish: [] for fish in gen.fishs}
        for fish in gen.fishs:
            for i in range(num_hours * num_days, len_data + 1):
                slice_data = slice_(all_data, i - num_hours * num_days, i)
                vec = preprocess_batch_(slice_data)
                for fish_ in gen.fishs:
                    vec.update({fish_: int(fish_ == fish)})
                time = 0
                for j in range(0, 24, 3):
                    if vec['time_{}'.format(j)] == 1:
                        time = j
                        break
                vec = {key: vec[key] for key in sorted(vec)}
                forecast = gen_forecast(row[fish], time, fish)
                vec.update({'forecast': forecast})
                train_data.append(vec)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

KeyboardInterrupt: 

In [7]:
len(train_data)

100908

In [8]:
train_dt = pd.DataFrame(train_data)

In [None]:
fishs_dt = [train_dt[train_dt[fish.capitalize()] == 1] for fish in fishs]

In [None]:
y_means = {fishs[i]: fishs_dt[i]['forecast'].mean() for i in range(len(fishs))}
y_means

In [None]:
train_dt.describe()

In [None]:
# f1 = train_dt[train_dt['forecast'] == 1]
# f0 = train_dt[train_dt['forecast'] == 0].sample(len(f1))
# train_dt = pd.concat([f0, f1])

In [9]:
train_dt = train_dt.sample(frac=1.0).reset_index(drop=True)

In [10]:
y = train_dt['forecast']
del train_dt['forecast']

In [11]:
y.std()

0.47901899032878875

In [12]:
train_dt.describe()

Unnamed: 0,day,gust_0,gust_1,gust_10,gust_11,gust_12,gust_13,gust_14,gust_15,gust_16,...,ясно_21,ясно_22,ясно_23,ясно_3,ясно_4,ясно_5,ясно_6,ясно_7,ясно_8,ясно_9
count,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,...,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0,100908.0
mean,15.738257,7.539422,7.573304,7.729308,7.754806,7.849358,7.901267,7.940996,7.827229,7.647362,...,0.184643,0.188667,0.186834,0.18939,0.187091,0.185654,0.184217,0.185079,0.186804,0.186804
std,8.697508,2.649403,2.652488,2.597937,2.608528,2.656629,2.690258,2.720205,2.687274,2.64734,...,0.38801,0.391246,0.38978,0.39182,0.389987,0.388829,0.387663,0.388364,0.389756,0.389756
min,1.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,16.0,7.0,7.0,8.0,8.0,8.0,8.0,8.0,8.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,23.0,9.0,9.0,9.0,9.0,9.0,10.0,10.0,9.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,31.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
X = train_dt.values

In [14]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [15]:
X_train.shape

(80726, 655)

In [16]:
X_test.shape

(20182, 655)

In [17]:
from catboost import CatBoostClassifier

model = CatBoostClassifier()

In [18]:
model.fit(X_train, y_train)

Learning rate set to 0.067178
0:	learn: 0.6698418	total: 270ms	remaining: 4m 29s
1:	learn: 0.6482569	total: 324ms	remaining: 2m 41s
2:	learn: 0.6293769	total: 376ms	remaining: 2m 4s
3:	learn: 0.6131838	total: 428ms	remaining: 1m 46s
4:	learn: 0.5999637	total: 481ms	remaining: 1m 35s
5:	learn: 0.5890073	total: 539ms	remaining: 1m 29s
6:	learn: 0.5785667	total: 588ms	remaining: 1m 23s
7:	learn: 0.5696092	total: 641ms	remaining: 1m 19s
8:	learn: 0.5609374	total: 702ms	remaining: 1m 17s
9:	learn: 0.5540723	total: 764ms	remaining: 1m 15s
10:	learn: 0.5479415	total: 811ms	remaining: 1m 12s
11:	learn: 0.5413949	total: 866ms	remaining: 1m 11s
12:	learn: 0.5363292	total: 920ms	remaining: 1m 9s
13:	learn: 0.5307909	total: 979ms	remaining: 1m 8s
14:	learn: 0.5265812	total: 1.04s	remaining: 1m 8s
15:	learn: 0.5219929	total: 1.1s	remaining: 1m 7s
16:	learn: 0.5178783	total: 1.16s	remaining: 1m 6s
17:	learn: 0.5151143	total: 1.22s	remaining: 1m 6s
18:	learn: 0.5120361	total: 1.27s	remaining: 1m 5s
1

161:	learn: 0.4339307	total: 8.93s	remaining: 46.2s
162:	learn: 0.4337034	total: 8.98s	remaining: 46.1s
163:	learn: 0.4334827	total: 9.04s	remaining: 46.1s
164:	learn: 0.4333496	total: 9.09s	remaining: 46s
165:	learn: 0.4330801	total: 9.13s	remaining: 45.9s
166:	learn: 0.4328185	total: 9.19s	remaining: 45.8s
167:	learn: 0.4325493	total: 9.24s	remaining: 45.8s
168:	learn: 0.4323227	total: 9.29s	remaining: 45.7s
169:	learn: 0.4321046	total: 9.34s	remaining: 45.6s
170:	learn: 0.4319447	total: 9.39s	remaining: 45.5s
171:	learn: 0.4316835	total: 9.43s	remaining: 45.4s
172:	learn: 0.4314684	total: 9.49s	remaining: 45.4s
173:	learn: 0.4311940	total: 9.54s	remaining: 45.3s
174:	learn: 0.4309627	total: 9.59s	remaining: 45.2s
175:	learn: 0.4307075	total: 9.65s	remaining: 45.2s
176:	learn: 0.4304624	total: 9.7s	remaining: 45.1s
177:	learn: 0.4302589	total: 9.75s	remaining: 45s
178:	learn: 0.4299663	total: 9.81s	remaining: 45s
179:	learn: 0.4297245	total: 9.87s	remaining: 44.9s
180:	learn: 0.42936

322:	learn: 0.4043512	total: 17.3s	remaining: 36.3s
323:	learn: 0.4042538	total: 17.4s	remaining: 36.3s
324:	learn: 0.4041637	total: 17.4s	remaining: 36.2s
325:	learn: 0.4040532	total: 17.5s	remaining: 36.2s
326:	learn: 0.4038899	total: 17.6s	remaining: 36.1s
327:	learn: 0.4037669	total: 17.6s	remaining: 36.1s
328:	learn: 0.4036418	total: 17.7s	remaining: 36s
329:	learn: 0.4035227	total: 17.7s	remaining: 36s
330:	learn: 0.4033998	total: 17.8s	remaining: 35.9s
331:	learn: 0.4032898	total: 17.8s	remaining: 35.9s
332:	learn: 0.4031649	total: 17.9s	remaining: 35.8s
333:	learn: 0.4030323	total: 17.9s	remaining: 35.8s
334:	learn: 0.4029195	total: 18s	remaining: 35.7s
335:	learn: 0.4027509	total: 18.1s	remaining: 35.7s
336:	learn: 0.4026176	total: 18.1s	remaining: 35.6s
337:	learn: 0.4024762	total: 18.2s	remaining: 35.6s
338:	learn: 0.4023370	total: 18.2s	remaining: 35.5s
339:	learn: 0.4022324	total: 18.3s	remaining: 35.4s
340:	learn: 0.4020697	total: 18.3s	remaining: 35.4s
341:	learn: 0.4019

481:	learn: 0.3865500	total: 25.2s	remaining: 27.1s
482:	learn: 0.3864497	total: 25.3s	remaining: 27.1s
483:	learn: 0.3863604	total: 25.3s	remaining: 27s
484:	learn: 0.3862436	total: 25.4s	remaining: 26.9s
485:	learn: 0.3861381	total: 25.4s	remaining: 26.9s
486:	learn: 0.3860261	total: 25.5s	remaining: 26.8s
487:	learn: 0.3859302	total: 25.5s	remaining: 26.8s
488:	learn: 0.3858459	total: 25.6s	remaining: 26.7s
489:	learn: 0.3857357	total: 25.6s	remaining: 26.7s
490:	learn: 0.3856390	total: 25.7s	remaining: 26.6s
491:	learn: 0.3855495	total: 25.7s	remaining: 26.5s
492:	learn: 0.3854447	total: 25.7s	remaining: 26.5s
493:	learn: 0.3853294	total: 25.8s	remaining: 26.4s
494:	learn: 0.3852184	total: 25.8s	remaining: 26.4s
495:	learn: 0.3851402	total: 25.9s	remaining: 26.3s
496:	learn: 0.3849641	total: 25.9s	remaining: 26.2s
497:	learn: 0.3848680	total: 26s	remaining: 26.2s
498:	learn: 0.3847644	total: 26s	remaining: 26.1s
499:	learn: 0.3846744	total: 26.1s	remaining: 26.1s
500:	learn: 0.3845

642:	learn: 0.3718223	total: 32.8s	remaining: 18.2s
643:	learn: 0.3717461	total: 32.8s	remaining: 18.2s
644:	learn: 0.3716585	total: 32.9s	remaining: 18.1s
645:	learn: 0.3715658	total: 32.9s	remaining: 18.1s
646:	learn: 0.3714589	total: 33s	remaining: 18s
647:	learn: 0.3713986	total: 33s	remaining: 17.9s
648:	learn: 0.3712662	total: 33.1s	remaining: 17.9s
649:	learn: 0.3712024	total: 33.1s	remaining: 17.8s
650:	learn: 0.3711200	total: 33.2s	remaining: 17.8s
651:	learn: 0.3710109	total: 33.2s	remaining: 17.7s
652:	learn: 0.3709276	total: 33.3s	remaining: 17.7s
653:	learn: 0.3708615	total: 33.3s	remaining: 17.6s
654:	learn: 0.3707846	total: 33.4s	remaining: 17.6s
655:	learn: 0.3706956	total: 33.4s	remaining: 17.5s
656:	learn: 0.3706227	total: 33.5s	remaining: 17.5s
657:	learn: 0.3705407	total: 33.5s	remaining: 17.4s
658:	learn: 0.3704599	total: 33.6s	remaining: 17.4s
659:	learn: 0.3703840	total: 33.6s	remaining: 17.3s
660:	learn: 0.3702980	total: 33.7s	remaining: 17.3s
661:	learn: 0.3702

803:	learn: 0.3593431	total: 40.2s	remaining: 9.81s
804:	learn: 0.3592847	total: 40.3s	remaining: 9.76s
805:	learn: 0.3591936	total: 40.3s	remaining: 9.7s
806:	learn: 0.3591269	total: 40.4s	remaining: 9.65s
807:	learn: 0.3590503	total: 40.4s	remaining: 9.6s
808:	learn: 0.3589744	total: 40.5s	remaining: 9.55s
809:	learn: 0.3589001	total: 40.5s	remaining: 9.5s
810:	learn: 0.3588237	total: 40.5s	remaining: 9.45s
811:	learn: 0.3587299	total: 40.6s	remaining: 9.4s
812:	learn: 0.3586507	total: 40.6s	remaining: 9.35s
813:	learn: 0.3586064	total: 40.7s	remaining: 9.29s
814:	learn: 0.3585499	total: 40.7s	remaining: 9.24s
815:	learn: 0.3584547	total: 40.8s	remaining: 9.2s
816:	learn: 0.3583766	total: 40.8s	remaining: 9.15s
817:	learn: 0.3582789	total: 40.9s	remaining: 9.1s
818:	learn: 0.3581924	total: 41s	remaining: 9.05s
819:	learn: 0.3581315	total: 41s	remaining: 9s
820:	learn: 0.3580576	total: 41s	remaining: 8.95s
821:	learn: 0.3579813	total: 41.1s	remaining: 8.9s
822:	learn: 0.3579132	total:

965:	learn: 0.3480660	total: 47.9s	remaining: 1.69s
966:	learn: 0.3479843	total: 47.9s	remaining: 1.64s
967:	learn: 0.3479281	total: 48s	remaining: 1.59s
968:	learn: 0.3478551	total: 48s	remaining: 1.54s
969:	learn: 0.3477809	total: 48.1s	remaining: 1.49s
970:	learn: 0.3476779	total: 48.1s	remaining: 1.44s
971:	learn: 0.3476139	total: 48.2s	remaining: 1.39s
972:	learn: 0.3475602	total: 48.2s	remaining: 1.34s
973:	learn: 0.3475018	total: 48.3s	remaining: 1.29s
974:	learn: 0.3474454	total: 48.3s	remaining: 1.24s
975:	learn: 0.3473616	total: 48.4s	remaining: 1.19s
976:	learn: 0.3473023	total: 48.4s	remaining: 1.14s
977:	learn: 0.3472413	total: 48.5s	remaining: 1.09s
978:	learn: 0.3471717	total: 48.5s	remaining: 1.04s
979:	learn: 0.3471170	total: 48.6s	remaining: 991ms
980:	learn: 0.3470777	total: 48.6s	remaining: 941ms
981:	learn: 0.3470044	total: 48.6s	remaining: 892ms
982:	learn: 0.3469323	total: 48.7s	remaining: 842ms
983:	learn: 0.3468875	total: 48.7s	remaining: 792ms
984:	learn: 0.34

<catboost.core.CatBoostClassifier at 0x2181c744c48>

In [19]:
preds = model.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.8203845010405312


In [21]:
model.predict_proba(X_test)

array([[0.94450721, 0.05549279],
       [0.78951024, 0.21048976],
       [0.93608497, 0.06391503],
       ...,
       [0.653385  , 0.346615  ],
       [0.96485846, 0.03514154],
       [0.73858378, 0.26141622]])

In [22]:
from joblib import dump

In [23]:
dump(model, 'catboost_0.5.model')

['catboost_0.5.model']

In [26]:
y_train.mean()

0.3564403042390308