In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
import sys
sys.path.insert(0, 'basic_codes')

import class_magnesium_not_drop_na #_transform_pairings
from class_magnesium_not_drop_na import *

from sklearn.preprocessing import LabelEncoder

In [3]:
# Data directory (relative to the notebook) and the file names inside it;
# full paths are built later as `fold + file_train` / `fold + file_test`.
fold = 'input/'
file_train = 'train.csv'
file_test = 'test.csv'

In [4]:
def number_encode_features(init_df):
    """Label-encode every object-dtype column of a DataFrame.

    Parameters
    ----------
    init_df : pandas.DataFrame
        Input frame; it is copied and never modified.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded copy, and a mapping ``column name -> fitted LabelEncoder``
        for each column that was encoded (empty when no column has object dtype).
    """
    result = init_df.copy()
    encoders = {}
    for column, dtype in result.dtypes.items():
        # Compare against the builtin `object`: the `np.object` alias was removed
        # in NumPy 1.24 and raises AttributeError there.
        if dtype == object:
            encoders[column] = LabelEncoder()
            result[column] = encoders[column].fit_transform(result[column])
    return result, encoders

In [5]:
def load_data(filename, test=True):
    '''
        Load a dataset from disk without instantiating the project class.

        Rows containing NaN are dropped, rows whose ``chainlen`` exceeds 1000 are
        removed, and a ``DSSR`` column is dropped when present.

        Parameters:
         filename - path of the file to read
         test - when True the file is parsed as comma-separated; when False it is
                parsed with pd.read_table's default separator (tab)

        Returns a dict with keys:
         data - the full filtered DataFrame
         features - list of column names used for training/prediction
                    (all columns except 'Id', 'index', 'mg', 'pdb_chain')
         x - DataFrame restricted to `features`
         y - np.array of the target column 'mg', or None when the file has no
             such column (e.g. the dataset to predict on)
    '''
    if test:
        data = pd.read_table(filename, sep=',').dropna()
    else:
        data = pd.read_table(filename).dropna()
    data = data[~(data['chainlen'] > 1000)]
    if 'DSSR' in data.columns:
        # Non-inplace drop: `data` is a filtered selection of the parsed frame, and
        # mutating it in place can trigger pandas' SettingWithCopyWarning.
        data = data.drop('DSSR', axis=1)

    non_feature_columns = ('Id', 'index', 'mg', 'pdb_chain')
    features = [column for column in data.columns if column not in non_feature_columns]

    x_test = data[features]

    # Explicit membership test instead of a bare `except:`, which would also hide
    # unrelated failures.
    y_test = np.array(data['mg']) if 'mg' in data.columns else None
    change_output('Data loaded')
    return {'data': data, 'features': features, 'x': x_test, 'y': y_test}

In [6]:
# Load the training set. test=False makes load_data call pd.read_table without an
# explicit `sep`, i.e. with pandas' default separator rather than ','.
train = load_data(fold + file_train, test=False)

Data loaded

In [7]:
# Load the dataset to predict on; test=True makes load_data parse it as comma-separated.
test = load_data(fold + file_test, test=True)

Data loaded

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the training rows for local validation; random_state pins the split.
# NOTE: X_test / y_test are this validation split of the training data, not the
# separately loaded `test` dataset.
X_train, X_test, y_train, y_test = train_test_split(
    train["x"],
    train["y"],
    test_size=0.2,
    random_state=42)
X_train.head()

Unnamed: 0,xray,resol,chainlen,protein,alpham2,betam2,gammam2,deltam2,epsilonm2,zetam2,...,atomO2,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3
113531,1,3.05,172,0,-56.0,168.8,61.0,78.4,-154.7,-18.6,...,0,0,0,0,0,1,0,0,0,0
116872,1,2.35,55,0,-70.3,179.3,53.5,79.7,-92.9,-101.6,...,0,0,1,0,0,0,0,0,0,0
118155,1,3.02,119,0,-63.2,-179.7,53.1,78.1,-139.6,-73.7,...,0,0,1,0,0,0,0,0,0,0
150345,1,2.801,158,1,-146.4,170.8,175.3,146.6,-90.3,75.9,...,0,0,1,0,0,0,0,0,0,0
210059,0,3.4,95,1,-55.2,166.9,49.5,77.5,-151.1,-79.4,...,0,0,0,0,0,0,0,1,0,0


In [10]:
# Preview the validation split to compare its columns with the training split.
X_test.head()

Unnamed: 0,xray,resol,chainlen,protein,alpham2,betam2,gammam2,deltam2,epsilonm2,zetam2,...,atomO2,atomO2p,atomO3p,atomO4,atomO4p,atomO5p,atomO6,atomOP1,atomOP2,atomOP3
150726,1,2.801,158,1,142.6,-165.8,-163.9,85.8,-149.2,-64.8,...,0,0,0,0,0,0,0,0,0,0
161151,1,3.118,119,1,-51.3,140.7,70.0,79.6,-153.3,-77.4,...,0,0,0,0,0,0,0,0,0,0
114399,1,2.28,77,0,-65.7,-179.2,51.3,82.1,-145.5,-79.6,...,0,0,0,0,0,0,0,0,0,0
118663,1,2.186,84,1,80.9,137.2,44.8,148.3,-93.7,-77.8,...,0,0,1,0,0,0,0,0,0,0
105359,1,2.212,16,0,-51.5,165.3,44.5,84.8,-147.4,-74.3,...,0,0,1,0,0,0,0,0,0,0


In [11]:

# Convert the splits to plain NumPy arrays; the positional column slicing used
# further down (X_train[:, feat_pred]) relies on array indexing.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [12]:
# Baseline gradient-boosted classifier for the local validation run.
# NOTE(review): both `nthread` and `n_jobs` are passed with different values;
# presumably only one is meant to set the thread count — confirm against the
# installed xgboost version and drop the other.
model = xgb.XGBClassifier(nthread=30, n_jobs=10, 
                          max_depth=5, learning_rate=0.3, n_estimators=350, scale_pos_weight=5)

In [13]:
# Fit on the training split; the bare `model` on the last line displays the
# fitted estimator's parameters as the cell output.
model = model.fit(X_train, y_train)
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.3, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=350,
       n_jobs=10, nthread=30, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=5, seed=None,
       silent=True, subsample=1)

In [None]:
# Predict labels for the held-out validation split.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score
# F1 score of the validation predictions against the held-out labels.
print(f1_score(y_test, y_pred))

In [23]:
# Per-feature importances of the fitted model, and a boolean mask that keeps
# only the features whose importance exceeds the 0.01 threshold.
feat = model.feature_importances_
feat_pred =feat > 0.01

In [24]:
# Keep only the columns selected by the importance mask.
# NOTE: this overwrites X_train / X_test, so re-running the cell applies the
# full-width mask to the already-reduced arrays — TODO(review): assign to new names.
X_train = X_train[:, feat_pred]
X_test = X_test[: , feat_pred]

array([0.02763749, 0.01683502, 0.02146465, 0.01992144, 0.01599327,
       0.01262626, 0.01038159, 0.0102413 , 0.01010101, 0.0102413 ,
       0.01080247, 0.01052189, 0.01010101, 0.01080247, 0.02132436,
       0.01725589, 0.01262626], dtype=float32)

In [18]:
def out_file(y_pred, model_name="xgb"):
    """Write predictions to ``out_<model_name>.txt`` in the current directory.

    The file starts with the header line ``Id,mg`` followed by one
    ``<position>,<prediction>`` line per element of ``y_pred``, where the
    position is the zero-based index of the prediction.
    """
    target_path = "out_{}.txt".format(model_name)
    with open(target_path, "w") as handle:
        handle.write("Id,mg\n")
        handle.writelines(
            "{},{}\n".format(row_id, label) for row_id, label in enumerate(y_pred)
        )

In [19]:
def applyModel(model=xgb.XGBClassifier(nthread=30, n_jobs=10, 
                                       max_depth=6, learning_rate=0.1,
                                       n_estimators=150, scale_pos_weight=5.4, random_state=42),
               X_train=train["x"], y_train=train["y"], X_test=test["x"]):
    """Fit ``model`` on all features and predict for ``X_test``.

    Prints the mean of the predictions, then calls ``check_error`` on them, and
    returns the prediction array.

    Defaults are evaluated once, when this cell runs: the default model instance
    is shared between calls, and ``train`` / ``test`` must already be loaded.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(predictions.mean())
    check_error(predictions)
    return predictions

In [46]:
def check_error(preds):
    """Print how many predictions differ from the reference file.

    The reference is read from ``out_perfect.cvs`` in the current directory
    (described by the author as their best solution): the second comma-separated
    field of every line is taken, and the ``mg`` header cell is removed. Each
    remaining value is compared as an int with the prediction at the same
    position, and the number of mismatches is printed. Nothing is returned.
    """
    with open('out_perfect.cvs') as reference_file:
        reference = [row.strip().split(',')[1] for row in reference_file]
    # Removes the first 'mg' entry (the header); raises ValueError when absent.
    reference.remove('mg')
    reference = np.array(reference)
    mismatches = sum(
        1
        for position in range(len(reference))
        if int(reference[position]) != int(preds[position])
    )
    print(mismatches)

In [34]:
def applyModelSelectFeatures(model=xgb.XGBClassifier(nthread=30, n_jobs=10, min_child_weight=5,
                                                 max_depth=6, learning_rate=0.1, scale_pos_weight=6.0,
                                                 n_estimators=200),
                             X_train=train["x"], y_train=train["y"], X_test=test["x"], num_important=5):
    """Fit, keep the most important features, refit on them and predict.

    Steps:
      1. fit ``model`` on every column of ``X_train``;
      2. take the ``num_important`` columns with the highest
         ``feature_importances_`` (printed in ascending order of importance);
      3. refit the same ``model`` on those columns only;
      4. predict for the same columns of ``X_test``, print the mean prediction,
         call ``check_error`` on the predictions and return them.

    ``X_train`` and ``X_test`` must be DataFrames, because columns are selected
    by name. Defaults are evaluated once, when this cell runs: the default model
    instance is shared between calls, and ``train`` / ``test`` must already exist.
    """
    fitted_model = model.fit(X_train, y_train)
    # argsort is ascending, so the last `num_important` positions are the top ones.
    top_positions = np.array(fitted_model.feature_importances_).argsort()[-num_important:]
    # Take the names from the frame that was actually fitted; the global
    # train['features'] is only aligned with the default X_train.
    important_features = np.array(X_train.columns)[top_positions]
    print(important_features)
    model.fit(X_train[important_features], y_train)
    y_pred = model.predict(X_test[important_features])
    print(y_pred.mean())
    check_error(y_pred)
    return y_pred
    

In [31]:
# Default XGBoost model, refit on its 20 most important features.
y_pred_xgb = applyModelSelectFeatures(num_important=20)

['betam1' 'chi' 'chim2' 'thetapm2' 'zetam2' 'thetapp' 'thetapm1' 'splay'
 'etappm2' 'etappm1' 'betam2' 'thetap' 'etapp1' 'splaym1' 'alpham2'
 'moietyB' 'wtlen' 'moietyP' 'resol' 'chainlen']
0.1433868974042027
354


  if diff:


In [79]:
# Experiment: shallower model (max_depth=5, 90 trees) refit on its top 10 features.
# The result overwrites y_pred_xgb from the earlier cell.
model=xgb.XGBClassifier(nthread=30, n_jobs=10,max_depth=5, min_child_weight=4, learning_rate=0.1, scale_pos_weight=5,
                      n_estimators=90, random_state=42)
y_pred_xgb = applyModelSelectFeatures(model=model, num_important=10)
# Earlier alternative configuration, kept for reference (not executed):
# model=xgb.XGBClassifier(nthread=30, n_jobs=10, 
#                         max_depth=4, learning_rate=0.1,
#                         n_estimators=175, scale_pos_weight=6.0, random_state=666)
# y_pred_xgb = applyModelSelectFeatures(model=model)

['thetap1' 'thetap' 'moietyB' 'thetapp' 'thetapm1' 'splaym1' 'etapp1'
 'wtlen' 'resol' 'chainlen']
0.1169344870210136
0


  if diff:


In [51]:
# score= 0.4017
# NOTE(review): presumably the score obtained by this submission — confirm the metric/source.

# Writes the current y_pred_xgb to out_xgb.txt.
out_file(y_pred_xgb, model_name="xgb")

In [77]:
# Experiment "xgb3": learning_rate=0.09 with 100 trees, refit on the top 10 features.
model=xgb.XGBClassifier(nthread=30, n_jobs=10,max_depth=5, min_child_weight=4, learning_rate=0.09, scale_pos_weight=5,
                          n_estimators=100, random_state=42)
y_pred_xgb3 = applyModelSelectFeatures(model=model, num_important=10)

['thetap1' 'thetapm2' 'thetap' 'moietyB' 'etapp1' 'thetapm1' 'splaym1'
 'wtlen' 'resol' 'chainlen']
0.1369592088998764
129


  if diff:


In [64]:
# score= 0.4017
# NOTE(review): the score above may have been recorded while this cell still wrote
# y_pred_xgb; re-check it now that the "xgb3" predictions are written.
out_file(y_pred_xgb3, model_name="xgb3")

In [95]:
# Experiment "xgb2": scale_pos_weight=5.48, random_state=430, refit on the top 11 features.
model=xgb.XGBClassifier(nthread=30, n_jobs=10, max_depth=5, min_child_weight=4, 
                        learning_rate=0.1, scale_pos_weight=5.48,
                          n_estimators=90, random_state=430)
y_pred_xgb2 = applyModelSelectFeatures(model=model, num_important=11)

['splay' 'thetapp1' 'etapp1' 'thetapm2' 'moietyB' 'thetap' 'thetapm1'
 'splaym1' 'wtlen' 'resol' 'chainlen']
0.13053152039555005
155


  if diff:


In [96]:

# Write the predictions of the "xgb2" experiment (y_pred_xgb2); the cell previously
# wrote y_pred_xgb, producing a copy of the "xgb" submission under this name.
out_file(y_pred_xgb2, model_name="xgb2")

In [15]:
import catboost
import importlib
# Re-executes the already-imported catboost module in the running kernel.
# NOTE(review): looks like a development leftover — confirm whether it is still needed.
importlib.reload(catboost)

<module 'catboost' from '/home/ulyanin/.local/lib/python3.5/site-packages/catboost/__init__.py'>

In [39]:

# IPython help lookup (trailing `?`), not plain Python; interactive aid only.
catboost.CatBoostClassifier?

In [44]:
# CatBoost experiment: 1000 iterations, seeded and silent, refit on the top 15 features.
model = catboost.CatBoostClassifier(thread_count=30, 
                           learning_rate=0.03, depth=5, scale_pos_weight=5.4,
                           loss_function='Logloss', random_seed=42, iterations=1000, logging_level='Silent')
y_pred_catboost = applyModelSelectFeatures(model=model, num_important=15)
# Earlier variant without seed / iteration count / logging level, kept for reference (not executed):
# model = catboost.CatBoostClassifier(thread_count=30, 
#                            learning_rate=0.03, depth=5, scale_pos_weight=5.4,
#                            loss_function='Logloss', )
# y_pred_catboost = applyModelSelectFeatures(model=model)

['atomN2' 'seqC1' 'gamma1' 'atomO2' 'thetap2' 'splay' 'thetap1' 'moietyB'
 'xray' 'thetap' 'wtlen' 'thetapm1' 'moietyR' 'chainlen' 'resol']
0.16019777503090235
342


In [24]:
# CatBoost variant without seed / iteration count / logging level; num_important
# falls back to the function default (5). When it succeeds it overwrites y_pred_catboost.
# NOTE: the recorded output of this cell is "NameError: name 'PY3' is not defined".
model = catboost.CatBoostClassifier(thread_count=30, 
                           learning_rate=0.03, depth=5, scale_pos_weight=5.4,
                           loss_function='Logloss')
y_pred_catboost = applyModelSelectFeatures(model=model)

NameError: name 'PY3' is not defined

In [20]:
# Cast the CatBoost predictions to integers before writing them out. Uses the
# builtin `int`: the `np.int` alias was removed in NumPy 1.24.
out_file(y_pred_catboost.astype(int), model_name="cat")