In [10]:
import os
os.environ["OMP_NUM_THREADS"] = "4"
import random
import gc
import re
import math
import hashlib
import datetime
import warnings
import copy

from collections import defaultdict
from multiprocessing import Pool, cpu_count

import tensorflow
import pandas as pd
import lightgbm as lgb
import numpy as np
import keras
from gensim import matutils
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
import scipy.spatial.distance
import nltk
from nltk.cluster import KMeansClusterer
from pandas.util import hash_pandas_object
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Where the downloaded data are.
# Override with the MLBOOTCAMP_INPUT_PATH environment variable; the default
# keeps the original author's local location.
input_path = os.environ.get(
    'MLBOOTCAMP_INPUT_PATH',
    '/Users/tyamgin/Projects/mlbootcamp/championship21/data',
)
# Where to store results.
# Override with the MLBOOTCAMP_OUTPUT_PATH environment variable.
output_path = os.environ.get(
    'MLBOOTCAMP_OUTPUT_PATH',
    '/Users/tyamgin/Projects/mlbootcamp/championship21/res',
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [34]:
def ppow(a, p):
    """Sign-preserving power, computed element-wise.

    Non-negative entries map to ``a ** p``; negative entries map to
    ``-((-a) ** p)``, so the result keeps the sign of ``a``.

    Args:
        a: scalar or array-like base.
        p: exponent (scalar or anything broadcastable against ``a``).

    Returns:
        numpy.ndarray with the broadcast shape of ``a`` and ``p``.
    """
    a = np.asarray(a)
    # Raise the magnitude only. The previous np.clip(a, 0, a) was a no-op
    # (a_min > a_max returns a_max), so negative bases still reached the
    # fractional power and produced NaN plus a RuntimeWarning on the branch
    # that np.where discards.
    magnitude = np.abs(a) ** p
    return np.where(a >= 0, magnitude, -magnitude)

In [38]:
def min2(a):
    """Return the second-smallest element of ``a``.

    Objects exposing a ``values`` attribute are unwrapped first. A
    collection holding a single element returns that element.
    """
    data = a.values if hasattr(a, 'values') else a
    if len(data) > 1:
        ascending = sorted(data)
        return ascending[1]
    return data[0]

def max2(a):
    """Return the second-largest element of ``a``.

    Objects exposing a ``values`` attribute are unwrapped first. A
    collection holding a single element returns that element.
    """
    data = a.values if hasattr(a, 'values') else a
    if len(data) > 1:
        ascending = sorted(data)
        return ascending[len(ascending) - 2]
    return data[0]

In [175]:
class MutableSettings:
    """Mutation step size and optional bounds attached to a mutable value."""

    def __init__(self, scale=1, min_value=None, max_value=None):
        # scale: spread handed to the random perturbation in MutableFloat.mutate.
        # min_value / max_value: clamp bounds; None leaves that side unbounded.
        self.scale, self.min_value, self.max_value = scale, min_value, max_value
        
class Mutable:
    """Base class for objects that carry a ``settings`` object."""
    def __init__(self, settings):
        # Stored as-is; nothing in this class validates or copies it.
        self.settings = settings

class MutableNumber(Mutable):
    """A numeric value paired with the settings that bound it."""

    def __init__(self, value, settings):
        self.value, self.settings = value, settings

    def ensure_ranges(self):
        """Clamp ``self.value`` in place to the configured bounds.

        A bound set to ``None`` is skipped. The lower bound is applied
        first, then the upper bound.
        """
        lower = self.settings.min_value
        upper = self.settings.max_value
        if lower is not None and self.value < lower:
            self.value = lower
        if upper is not None and self.value > upper:
            self.value = upper
        
class MutableFloat(MutableNumber):
    """Float gene with Gaussian mutation and three-way crossover."""

    def mutate(self):
        """Return a new gene drawn from a normal distribution centred on the
        current value (spread ``settings.scale``), clamped to the bounds."""
        perturbed = np.random.normal(self.value, self.settings.scale)
        child = MutableFloat(perturbed, self.settings)
        child.ensure_ranges()
        return child

    def crossover_with(self, other):
        """Return a new gene whose value is chosen at random among this
        gene's value, ``other``'s value and their midpoint.

        The child reuses this gene's settings and is clamped to them.
        """
        candidates = [self.value, other.value, (self.value + other.value) / 2]
        child = MutableFloat(np.random.choice(candidates), self.settings)
        child.ensure_ranges()
        return child
    
class MutableFloatFactory:
    """Produces ``MutableFloat`` genes bounded by ``[min_value, max_value]``."""

    def __init__(self, min_value, max_value, start_value=None):
        self.min_value, self.max_value = min_value, max_value
        self.start_value = start_value
        # Mutation spread is one fifteenth of the allowed range.
        self.scale = (max_value - min_value) / 15

    def get(self):
        """Create a gene.

        The initial value is ``start_value`` when one was supplied,
        otherwise a uniform draw between the two bounds.
        """
        if self.start_value is None:
            initial = np.random.uniform(self.min_value, self.max_value)
        else:
            initial = self.start_value
        settings = MutableSettings(
            scale=self.scale,
            min_value=self.min_value,
            max_value=self.max_value,
        )
        return MutableFloat(initial, settings)

class Chromosome:
    """An ordered list of genes that are mutated and recombined together.

    Every entry of ``items`` must expose ``value``, ``mutate()`` and
    ``crossover_with(other)``.
    """

    def __init__(self):
        self.items = []

    def mutate(self):
        """Return a copy in which a random selection of genes is mutated.

        Between 0 and ``len(self.items)`` positions are drawn with
        replacement, so a position may be mutated more than once. The
        receiver itself is left untouched.
        """
        result = Chromosome()
        result.items = list(self.items)
        gene_count = len(self.items)
        if gene_count == 0:
            # Nothing to mutate, and np.random.randint(0) would raise.
            return result
        # randint's upper bound is exclusive; the +1 lets the count reach
        # gene_count, so a single-gene chromosome is able to mutate at all.
        mutation_count = np.random.randint(gene_count + 1)
        for i in np.random.choice(np.arange(gene_count), size=mutation_count):
            result.items[i] = result.items[i].mutate()
        return result

    def crossover_with(self, other):
        """Return a child built by crossing each gene with the gene at the
        same position in ``other``."""
        result = Chromosome()
        result.items = [
            gene.crossover_with(other.items[i])
            for i, gene in enumerate(self.items)
        ]
        return result

    def get_values(self):
        """Return the gene values as a tuple (hashable, usable as a set key)."""
        return tuple(gene.value for gene in self.items)
        
class Population:
    """Fixed-size pool of scored chromosomes evolved by crossover and mutation.

    ``chromosomes`` holds ``(chromosome, score)`` tuples in descending score
    order, so index 0 is the best individual. Higher scores are better.
    """

    def __init__(self, size, items, score_getter):
        """Build and score the initial population, then print the best one.

        Args:
            size: number of chromosomes to create and to keep after each epoch.
            items: gene factories; each must expose ``get()`` returning a gene.
            score_getter: callable taking a tuple of gene values and
                returning the score to maximise.
        """
        self.size = size
        self.chromosomes = []
        self.score_getter = score_getter
        self.epochs_count = 0
        for _ in range(size):
            chromosome = Chromosome()
            for item in items:
                chromosome.items.append(item.get())
            self.chromosomes.append((chromosome, self.get_score(chromosome)))
        # Best first: get_best_values() and the elite carried over by
        # do_epoch() both read index 0.
        self.chromosomes.sort(key=lambda pair: pair[1], reverse=True)
        self.print_info()

    def get_best_values(self):
        """Return the gene values of the top-ranked chromosome."""
        return self.chromosomes[0][0].get_values()

    def get_score(self, chromosome):
        """Score ``chromosome`` by passing its values to ``score_getter``."""
        return self.score_getter(chromosome.get_values())

    def print_info(self):
        """Print the best chromosome's values and its score."""
        print('Best values: {} (score={})'.format(', '.join(map(str, self.get_best_values())), self.chromosomes[0][1]))

    def get_probabilities(self):
        """Return selection probabilities, one per chromosome, summing to 1.

        Scores are shifted so the minimum is 0 and normalised to sum to 1;
        a 0.03 floor is then added to each so that the worst individual can
        still be selected, and the result is normalised again.
        """
        # Force a float array: integer scores would make the in-place
        # division and the += 0.03 below raise a casting error.
        scores = np.array([score for _, score in self.chromosomes], dtype=float)
        scores -= scores.min()
        total = scores.sum()
        if total > 0:
            scores /= total
        scores += 0.03
        return scores / scores.sum()

    def do_epoch(self):
        """Breed ``2 * size`` children and keep the best ``size`` distinct ones.

        The current best individual competes with the children unchanged.
        Survivors are de-duplicated by their value tuple, so the population
        may end up smaller than ``size``.
        """
        self.epochs_count += 1
        pairs = [self.chromosomes[0]]
        population_count = len(self.chromosomes)
        indices = np.arange(population_count)
        # The population is not modified inside the loop, so the selection
        # probabilities are computed once.
        probabilities = self.get_probabilities()
        for _ in range(2 * self.size):
            if population_count >= 2:
                i, j = np.random.choice(indices, replace=False, p=probabilities, size=2)
            else:
                # Two distinct parents cannot be drawn from one individual;
                # cross the sole survivor with itself and rely on mutation.
                i = j = 0
            new_chr = self.chromosomes[i][0].crossover_with(self.chromosomes[j][0]).mutate()
            pairs.append((new_chr, self.get_score(new_chr)))

        existing = set()
        pairs.sort(key=lambda x: x[1], reverse=True)
        self.chromosomes = []
        for ch, score in pairs:
            if len(self.chromosomes) >= self.size:
                break
            values = ch.get_values()
            if values not in existing:
                existing.add(values)
                self.chromosomes.append((ch, score))
        self.print_info()
        

In [174]:
def score_getter(x):
    """Toy fitness function: sum of consecutive differences plus the smallest one.

    With fewer than two elements there are no differences, so the result is
    the 1e10 sentinel used as the initial minimum.
    """
    steps = [x[k] - x[k - 1] for k in range(1, len(x))]
    smallest_step = min([1e10, *steps])
    return sum(steps) + smallest_step
# Disabled demo, kept as a string literal so it does not execute: it seeds the
# RNGs, builds a 10-chromosome population of five floats in [0, 5] and runs
# 1000 epochs. seed_everything is defined in a later cell of this notebook.
'''
seed_everything(522)
population = Population(size=10, items=[
    MutableFloatFactory(min_value=0, max_value=5),
    MutableFloatFactory(min_value=0, max_value=5),
    MutableFloatFactory(min_value=0, max_value=5),
    MutableFloatFactory(min_value=0, max_value=5),
    MutableFloatFactory(min_value=0, max_value=5),
], score_getter=score_getter)
for i in range(1000):
    population.do_epoch()
'''

Best values: 1.2808941497904573, 4.538959631430654, 1.9014028649706216, 2.5973842697274985, 1.9047711315739062 (score=-2.013679784676584)
Best values: 0.5374850150123528, 1.2152718565674043, 1.340910410257261, 1.9231115866701043, 3.1509599113364577 (score=2.739113450013962)
Best values: 1.05457390336589, 1.4910310379200227, 0.5377166557518407, 2.327988830506929, 4.966487941139377 (score=2.9585996556053047)
Best values: 1.1747114339921239, 0.8396461818110431, 0.5564972694790141, 2.327988830506929, 5 (score=3.4902233138267955)
Best values: 0.0, 1.329674842571441, 1.0974931435318662, 3.7960312198832145, 5 (score=4.767818300960425)
Best values: 0.0, 0.8385399323651481, 1.0590247245397761, 3.7960312198832145, 5.0 (score=5.220484792174628)
Best values: 0.0, 0.9325950445228179, 1.8218556902366942, 4.159873068413741, 5 (score=5.840126931586259)
Best values: 0.0, 0.9325950445228179, 1.8218556902366942, 4.159873068413741, 5 (score=5.840126931586259)
Best values: 0.0, 0.926231392660291, 1.8218556

Best values: 0.0, 1.1527805880714979, 2.3117614834385556, 3.598098927964058, 5 (score=6.152780588071498)
Best values: 0.0, 1.1527805880714979, 2.3117614834385556, 3.598098927964058, 5 (score=6.152780588071498)
Best values: 0.0, 1.1527805880714979, 2.3117614834385556, 3.598098927964058, 5 (score=6.152780588071498)
Best values: 0.0, 1.1527805880714979, 2.3117614834385556, 3.598098927964058, 5 (score=6.152780588071498)
Best values: 0.0, 1.1527805880714979, 2.3117614834385556, 3.598098927964058, 5 (score=6.152780588071498)
Best values: 0.0, 1.3167101175419367, 2.5647140578661043, 3.79327486137395, 5 (score=6.20672513862605)
Best values: 0.0, 1.3167101175419367, 2.5647140578661043, 3.79327486137395, 5 (score=6.20672513862605)
Best values: 0.0, 1.3167101175419367, 2.5647140578661043, 3.79327486137395, 5 (score=6.20672513862605)
Best values: 0.0, 1.3167101175419367, 2.5647140578661043, 3.79327486137395, 5 (score=6.20672513862605)
Best values: 0.0, 1.3167101175419367, 2.5647140578661043, 3.793

Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.4698568424557736, 3.7644283282786235, 5.0 (score=6.234745352806717)
Best values: 0, 1.2347453528067174, 2.46985684

Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2.5216466760396994, 3.7644283282786235, 5.0 (score=6.2355716717213765)
Best values: 0, 1.2725355834280137, 2

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1.2583642469450276, 2.5101813616345043, 3.7551120933810775, 5.0 (score=6.2448879066189225)
Best values: 0.0, 1

In [128]:
# Scratch check: true division of two ints.
5/4

1.25

In [81]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy and TensorFlow with the same value.

    Also exports ``PYTHONHASHSEED`` to ``os.environ``.

    Args:
        seed: integer seed handed to every generator.
    """
    random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this only
    # takes effect in processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    #torch.manual_seed(seed)
    #torch.cuda.manual_seed(seed)
    #torch.backends.cudnn.deterministic = True
    tf_random = getattr(tensorflow, 'random', None)
    if hasattr(tf_random, 'set_seed'):
        # TensorFlow 2.x name.
        tf_random.set_seed(seed)
    else:
        # TensorFlow 1.x name; absent from the top-level namespace in 2.x.
        tensorflow.set_random_seed(seed)

In [94]:
# Scratch check: draw two distinct values from [1, 2, 3] with the given weights.
np.random.choice([1,2,3], replace=False, p=[0.4, 0.3, 0.3], size=2)

array([3, 2])

In [97]:
# Scratch check: the integers 0 through 4 as an array.
np.arange(5)

array([0, 1, 2, 3, 4])

In [116]:
# Scratch check: a single draw from a list returns one element, not an array.
np.random.choice([0, 1, 2])

1