In [1]:
import numpy as np
import pandas as pd

pd.options.display.max_columns = 100

In [2]:
data = pd.read_csv('input/train.csv')

In [3]:
data.shape

(10000, 20)

In [4]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


### train_test_split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
train, valid = train_test_split(data, test_size=0.3, random_state=42)

In [7]:
train.shape, valid.shape

((7000, 20), (3000, 20))

### Prepare train, valid

In [8]:
def add_info1(df, df_source, feat):
    """Attach the relative frequency of each `feat` value as a new column.

    Frequencies are computed on `df_source` with
    ``value_counts(normalize=True)`` and left-joined onto `df` under the name
    ``'flat_qty_' + feat``.  Rows whose `feat` value does not occur in
    `df_source` receive the smallest frequency present in the merged column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to enrich.  It is not modified; the merged result is returned.
    df_source : pandas.DataFrame
        Frame the frequencies are estimated from.
    feat : str
        Name of a column present in both frames.

    Returns
    -------
    pandas.DataFrame
        The merge result: `df`'s columns plus the new frequency column.
    """
    flat_qty = 'flat_qty_' + feat
    # Name the index and the value column explicitly.  The labels that a plain
    # reset_index() gives a value_counts() result differ between pandas
    # versions ('index'/feat vs. feat/'proportion'), so renaming by those
    # labels breaks the merge key on newer releases.
    distr_info1 = (
        df_source[feat]
        .value_counts(normalize=True)
        .rename_axis(feat)
        .reset_index(name=flat_qty)
    )

    df = pd.merge(df, distr_info1, on=feat, how='left')
    # Unseen categories: fall back to the rarest observed share.
    df[flat_qty] = df[flat_qty].fillna(df[flat_qty].min())
    return df

In [9]:
# Mean training price for every (DistrictId, Rooms) pair.
distr_stat_dr = (
    train.groupby(['DistrictId', 'Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_dr'})
)

# Mean training price for every Rooms value.
distr_stat_r = (
    train.groupby(['Rooms'], as_index=False)[['Price']]
    .mean()
    .rename(columns={'Price': 'mean_price_r'})
)

# Overall mean training price.
mean_price = train['Price'].mean()

In [10]:
def add_stats(df, distr_stat_dr, distr_stat_r, mean_price):
    """Join the pre-computed mean-price tables onto `df`.

    `distr_stat_dr` is left-joined on ``['DistrictId', 'Rooms']`` and
    `distr_stat_r` on ``'Rooms'``.  Missing ``mean_price_r`` values are
    replaced by `mean_price`; missing ``mean_price_dr`` values are then
    replaced by the (already filled) ``mean_price_r`` of the same row.

    Returns the merged frame; `df` itself is left untouched.
    """
    enriched = (
        df.merge(distr_stat_dr, on=['DistrictId', 'Rooms'], how='left')
          .merge(distr_stat_r, on='Rooms', how='left')
    )

    # Fill the coarser statistic first: it is the fallback for the finer one.
    rooms_price = enriched['mean_price_r'].fillna(mean_price)
    enriched['mean_price_r'] = rooms_price
    enriched['mean_price_dr'] = enriched['mean_price_dr'].fillna(rooms_price)
    return enriched

In [11]:
def add_cat_fts(df, cat_fts=('Ecology_2', 'Ecology_3', 'Shops_2')):
    """Encode the given categorical columns as 0/1 integers (1 where the value is 'B').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in `cat_fts`.
    cat_fts : iterable of str
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the listed columns encoded.  The input frame is
        not modified.
    """
    # Work on a copy so that the caller's frame (possibly a slice of another
    # frame) is never written to.
    df = df.copy()
    for col in cat_fts:
        # A column that is already numeric has been encoded before; comparing
        # it with 'B' again would turn every value into 0.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = (df[col] == 'B').astype(int)
    return df

In [12]:
def fillna_healthcare_1(df, fill_value=0):
    """Replace missing values in the 'Healthcare_1' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Healthcare_1' column.
    fill_value : scalar, default 0
        Value written where 'Healthcare_1' is missing.  The default keeps the
        original behaviour of filling with 0.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the column filled.  The input frame is not
        modified.
    """
    # Copy first so the caller's frame is never written to.
    df = df.copy()
    df['Healthcare_1'] = df['Healthcare_1'].fillna(fill_value)
    return df

In [13]:
def prepare_data(df, df_source, distr_stat_dr, distr_stat_r, mean_price):
    """Run the full feature-preparation pipeline on `df`.

    Steps, in order:
      1. `add_info1` - add the 'DistrictId' frequency column, estimated on `df_source`;
      2. `add_stats` - join the mean-price tables, using `mean_price` as the last fallback;
      3. `add_cat_fts` - encode the categorical columns with its default column list;
      4. `fillna_healthcare_1` - fill missing 'Healthcare_1' values.

    Returns the prepared frame.
    """
    return (
        df.pipe(add_info1, df_source, 'DistrictId')
          .pipe(add_stats, distr_stat_dr, distr_stat_r, mean_price)
          .pipe(add_cat_fts)
          .pipe(fillna_healthcare_1)
    )

In [14]:
train = prepare_data(train, train, distr_stat_dr, distr_stat_r, mean_price)

In [15]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price,flat_qty_DistrictId,mean_price_dr,mean_price_r
0,14604,23,1.0,41.68138,22.796166,8.0,14,17.0,2015,0.075779,1,1,6,1437,3,0.0,0,2,1,88504.384965,0.056286,102427.030975,160134.810901
1,5621,23,3.0,163.495333,161.504222,12.0,5,3.0,1977,0.014073,1,1,2,475,0,0.0,0,0,1,207007.956663,0.056286,165911.1297,290867.452543
2,235,87,1.0,39.710131,19.538663,8.0,4,17.0,1986,0.100456,1,1,43,7227,0,0.0,1,6,0,182126.280899,0.003,169596.630515,160134.810901
3,16258,48,3.0,96.056784,98.152802,1.0,15,1.0,2017,0.041125,1,1,46,9515,5,0.0,1,10,1,524365.550705,0.008857,382424.639356,290867.452543
4,10773,77,3.0,79.195825,44.421062,10.0,16,17.0,1984,0.298205,1,1,16,4048,3,0.0,1,3,1,322048.43399,0.004,251751.766701,290867.452543


In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 0 to 6999
Data columns (total 23 columns):
Id                     7000 non-null int64
DistrictId             7000 non-null int64
Rooms                  7000 non-null float64
Square                 7000 non-null float64
LifeSquare             5514 non-null float64
KitchenSquare          7000 non-null float64
Floor                  7000 non-null int64
HouseFloor             7000 non-null float64
HouseYear              7000 non-null int64
Ecology_1              7000 non-null float64
Ecology_2              7000 non-null int32
Ecology_3              7000 non-null int32
Social_1               7000 non-null int64
Social_2               7000 non-null int64
Social_3               7000 non-null int64
Healthcare_1           7000 non-null float64
Helthcare_2            7000 non-null int64
Shops_1                7000 non-null int64
Shops_2                7000 non-null int32
Price                  7000 non-null float64
flat_qty_District

In [17]:
valid = prepare_data(valid, train, distr_stat_dr, distr_stat_r, mean_price)

### Model

In [18]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'flat_qty_DistrictId',
       'mean_price_dr', 'mean_price_r'],
      dtype='object')

In [None]:
feats = ['Rooms', 'Square', 'flat_qty_DistrictId', 'mean_price_dr', 'Helthcare_2', 'Healthcare_1', 'HouseYear']
#feats = ['Id', 'Square', 'Floor', 'HouseFloor', 'Ecology_3', 'Social_1', 'Social_2', 'Healthcare_1', 'Shops_1', 'Price']

In [None]:
# ['Id',
#  'DistrictId',
#  'Rooms',
#  'Square',
#  'LifeSquare',
#  'KitchenSquare',
#  'Floor',
#  'HouseFloor',
#  'HouseYear',
#  'Ecology_1',
#  'Ecology_2',
#  'Ecology_3',
#  'Social_1',
#  'Social_2',
#  'Social_3',
#  'Healthcare_1',
#  'Helthcare_2',
#  'Shops_1',
#  'Shops_2',
#  'Price',
#  'flat_qty_distr',
# 'mean_price_dr',
#  'mean_price_r'],
# [0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1]
# feats = []

In [None]:
from sklearn.ensemble import RandomForestRegressor as RF

In [None]:
# ?RF

In [None]:
model = RF(n_estimators=300, max_depth=12, random_state=42, max_features=4, min_samples_leaf=2)

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 0 to 6999
Data columns (total 23 columns):
Id                     7000 non-null int64
DistrictId             7000 non-null int64
Rooms                  7000 non-null float64
Square                 7000 non-null float64
LifeSquare             5514 non-null float64
KitchenSquare          7000 non-null float64
Floor                  7000 non-null int64
HouseFloor             7000 non-null float64
HouseYear              7000 non-null int64
Ecology_1              7000 non-null float64
Ecology_2              7000 non-null int32
Ecology_3              7000 non-null int32
Social_1               7000 non-null int64
Social_2               7000 non-null int64
Social_3               7000 non-null int64
Healthcare_1           7000 non-null float64
Helthcare_2            7000 non-null int64
Shops_1                7000 non-null int64
Shops_2                7000 non-null int32
Price                  7000 non-null float64
flat_qty_District

In [None]:
model.fit(train.loc[:, feats], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=2,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=None, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [None]:
pred_train = model.predict(train.loc[:, feats])

In [None]:
pred_train.shape

(7000,)

In [None]:
pred_train

array([ 93557.50779231, 193197.38917049, 175801.54000418, ...,
       240602.56681924, 204675.32818292, 399193.50580112])

In [None]:
pred_valid = model.predict(valid.loc[:, feats])

In [None]:
pred_valid.shape

(3000,)

In [None]:
pred_valid

array([188078.2573157 , 337788.61879459, 217087.7267442 , ...,
       274195.46828096, 112673.01281904, 260862.25254602])

### Evaluate model

In [None]:
from sklearn.metrics import r2_score as r2

In [None]:
r2(train['Price'], pred_train)

0.8978579318090508

In [None]:
r2(valid['Price'], pred_valid)

0.683087842651711

### Test

In [None]:
# ?prepare_data

In [None]:
test = pd.read_csv('input/test.csv')

In [None]:
test = prepare_data(test, train, distr_stat_dr, distr_stat_r, mean_price)

In [None]:
test['Price'] = model.predict(test.loc[:, feats])

In [None]:
test.loc[:, ['Id', 'Price']].to_csv('AAnonymous_predictions.csv', index=None)

In [None]:
import random

In [None]:
# Candidate feature names for the genetic search: every column of `train`
# except 'Id', 'LifeSquare' and 'Price'.
columns = [name for name in train.columns if name not in ('Id', 'LifeSquare', 'Price')]

In [None]:
from itertools import compress

def filt(feat_genes):
    """Return the entries of the global `columns` whose matching flag in `feat_genes` is truthy.

    Pairing stops at the shorter of the two sequences.
    """
    return [name for name, keep in zip(columns, feat_genes) if keep]

In [None]:
len(train.columns)

23

In [None]:
feat_genes = [random.choice([True, False]) for _ in range(len(columns))]
feat_genes

[False,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False]

In [None]:
import random
from concurrent.futures import ProcessPoolExecutor

class Chromosome:
    """One candidate solution: a feature mask plus random-forest hyper-parameters.

    ``genes`` is a flat list.  Its first ``len(columns)`` entries are booleans
    that select entries of the global ``columns`` list; the remaining entries
    are, in order: n_estimators, max_depth, min_samples_split,
    min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes,
    min_impurity_decrease.  Only n_estimators, max_depth, min_samples_leaf and
    max_features are currently passed to the model in `set_fitness`.
    """

    def __init__(self, generate=False):
        """Create a chromosome; with ``generate=True`` fill it with random genes."""
        # Always define `genes` so the properties work on a chromosome whose
        # genes have not been assigned yet.
        self.genes = []
        if generate:
            self._generate()
        self.fitness = 0
        self.done = False        # fitness has been evaluated
        self.just_done = False   # fitness was evaluated by the latest set_fitness() call
        self.mutated = False     # only read by __str__
        self.error = None        # why the fitness was forced to 0, if it was

    @property
    def feats_genes(self):
        """The boolean feature-mask part of `genes`."""
        return self.genes[:len(columns)]

    @property
    def other_genes(self):
        """The hyper-parameter part of `genes`."""
        return self.genes[len(columns):]

    def _generate(self):
        """Draw a random feature mask and random hyper-parameter genes."""
        self.genes = [random.choice([True, False]) for _ in range(len(columns))]

        n_estimators = 100
        max_depth = random.randint(8, 18)
        # scikit-learn requires an integer min_samples_split of at least 2.
        min_samples_split = random.randint(2, 10)
        min_samples_leaf = random.randint(1, 10)
        # scikit-learn requires min_weight_fraction_leaf to lie in [0, 0.5].
        min_weight_fraction_leaf = random.uniform(0, 0.5)
        max_features = random.choice([None, 'sqrt', 'log2', random.randint(3, len(columns))])
        # scikit-learn requires max_leaf_nodes to be greater than 1.
        max_leaf_nodes = random.randint(2, 10)
        min_impurity_decrease = random.uniform(0, 5)

        self.genes.extend([
            n_estimators,
            max_depth,
            min_samples_split,
            min_samples_leaf,
            min_weight_fraction_leaf,
            max_features,
            max_leaf_nodes,
            min_impurity_decrease,
        ])

    def set_fitness(self):
        """Evaluate the chromosome once and store the validation R^2 in `fitness`.

        A chromosome that was already evaluated is not re-fitted; only its
        `just_done` flag is cleared.  Otherwise a random forest is fitted on
        the selected columns of the global `train` frame and scored on the
        global `valid` frame.  If no column is selected, or fitting/scoring
        raises, the fitness is 0 and the reason is kept in `error`.
        """
        if self.done:
            self.just_done = False
            return

        self.error = None
        # Only the mask part of the genes selects features.
        selected = filt(self.feats_genes)
        if not selected:
            # Nothing to fit on; skip the model instead of relying on fit() failing.
            self.error = 'no features selected'
            self.fitness = 0
        else:
            try:
                max_features = self.other_genes[5]
                # An integer max_features was drawn against len(columns) and may
                # exceed the number of columns actually selected; cap it so the
                # candidate is scored instead of being rejected by the model
                # (NOTE(review): whether fit() raises here depends on the
                # scikit-learn version - confirm).
                if isinstance(max_features, int) and not isinstance(max_features, bool):
                    max_features = min(max_features, len(selected))

                rf = RF(n_estimators=self.other_genes[0],
                        max_depth=self.other_genes[1],
                        min_samples_leaf=self.other_genes[3],
                        max_features=max_features,
                        random_state=42,
                        n_jobs=-1)

                rf.fit(train.loc[:, selected], train['Price'])
                pred_valid = rf.predict(valid.loc[:, selected])
                self.fitness = r2(valid['Price'], pred_valid)
            except Exception as exc:
                # Best effort: an invalid candidate gets the worst score, but
                # KeyboardInterrupt/SystemExit still propagate and the cause
                # is kept for inspection.
                self.error = repr(exc)
                self.fitness = 0
        self.done = self.just_done = True

    @property
    def feats_genes_str(self):
        """The feature mask rendered as a list of 0/1, as a string."""
        return str([int(b) for b in self.feats_genes])

    @property
    def other_genes_str(self):
        """The hyper-parameter genes with floats formatted to two decimals (a list)."""
        return ['{0:0.2f}'.format(i) if isinstance(i, float) else i for i in self.other_genes]

    def __str__(self):
        just_done = '*' if self.just_done else ' '
        mutated = 'm' if self.mutated else ' '
        return f'{100*self.fitness:6.2f}% {self.feats_genes_str} {self.other_genes_str} {just_done} {mutated}'


def cross(chro1, chro2):
    """Two-point crossover of two chromosomes.

    Two cut points ``start < stop`` are drawn at random; each child keeps one
    parent's genes outside ``[start, stop)`` and takes the other parent's
    genes inside it.  Returns the two children as new `Chromosome` objects.
    """
    n_genes = len(chro1.genes)
    start = random.randint(0, n_genes - 1)
    stop = random.randint(start + 1, n_genes)

    def _spliced(outer, inner):
        # Genes of `outer` with the [start, stop) segment taken from `inner`.
        child = Chromosome()
        child.genes = outer.genes[:start] + inner.genes[start:stop] + outer.genes[stop:]
        return child

    return _spliced(chro1, chro2), _spliced(chro2, chro1)


def ftns(chro):
    """Evaluate `chro` and return it.

    Prints a '---' marker, calls ``chro.set_fitness()`` and hands the same
    object back, so the function can be used with ``map``-style helpers that
    collect return values.
    """
    print('---')
    chro.set_fitness()
    return chro

class Ga:
    """A simple generational genetic algorithm over `Chromosome` objects.

    The population is kept sorted by fitness, best first.  Each call to
    `next` overwrites the worse half with children of parents drawn from the
    better half.
    """

    def __init__(self, size):
        """Create and evaluate an initial random population of `size` chromosomes."""
        self.generate(size)

    def generate(self, size):
        """(Re)initialise the population with `size` random chromosomes."""
        self.generation = [Chromosome(True) for _ in range(size)]
        self.set_fitnesses()
        self._sort()
        self.iteration = 1

    def next(self):
        """Advance one generation: replace the worse half with crossover children."""
        end = len(self.generation)
        middle = end // 2
        gener = self.generation
        for i in range(middle, end, 2):
            parent1, parent2 = self._random_choice(middle)
            child1, child2 = cross(gener[parent1], gener[parent2])
            gener[i] = child1
            # When the replaced half has odd length the last pair has only
            # one free slot; the second child is dropped.
            if i + 1 < end:
                gener[i + 1] = child2
        self.set_fitnesses()
        self._sort()
        self.iteration += 1

    def set_fitnesses(self):
        """Evaluate every chromosome, printing a running counter as progress."""
        for j, chro in enumerate(self.generation, start=1):
            chro.set_fitness()
            print(j, end=' ')
        print()

    def _random_choice(self, n):
        """Pick two distinct indices from ``range(n)``, favouring low indices.

        Index ``i`` gets weight ``n - i``.  Because the population is sorted
        best-first, this is a rank-based selection.
        """
        weights = np.arange(n, 0, -1, dtype=float)
        # The weights must be handed to the sampler; without `p=` the draw is uniform.
        return np.random.choice(n, 2, replace=False, p=weights / weights.sum())

    def _sort(self):
        """Sort the population by fitness, highest first."""
        self.generation = sorted(self.generation, key=lambda x: x.fitness, reverse=True)

    def __str__(self):
        """Render the iteration number and up to the 50 best chromosomes."""
        ret = f'================== {self.iteration:3} ==================\n'
        for rank, chro in enumerate(self.generation[:50], start=1):
            ret += f'{rank: 3}){chro}\n'
        return ret


# Seed both generators the search draws from (Chromosome and cross use
# `random`, Ga._random_choice uses `np.random`) so a fresh run is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

POPULATION = 100
N_GENERATIONS = 1000  # total number of generations, counting the initial one

ga = Ga(POPULATION)
print(ga)

for _ in range(1, N_GENERATIONS):
    ga.next()
    print(ga)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 

In [None]:
z = 0.01234
f'{100*z:6.2}%'

In [None]:
x = [i for i in range(3)]
prob = [i for i in range(3, 0, -1)]
print(prob)
sum_ = sum(prob)
for i in range(len(prob)):
    prob[i] /= sum_
# np.random.choice()
prob, sum(prob)
print(prob)
zz = []
for i in range(10000):
    zz.append(np.random.choice(x, p=prob))
pd.Series(zz).value_counts(normalize=True)

In [None]:
# ?np.random.choice

In [None]:
#Cell[StyleData["Print"], FontSize->24, FontColor->RGBColor[1, 0, 0]]

type(0.1) is float