In [19]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn. preprocessing import OneHotEncoder
from sklearn. preprocessing import PowerTransformer
from catboost import CatBoostClassifier
from sklearn import metrics

In [20]:
train_df = pd.read_csv('../../data/raw/train.csv', sep=',')
test_df = pd.read_csv('../../data/raw/test.csv', sep=',')
test_df['id'] = np.nan

In [3]:
def pred_obrabotka(df):
  df.loc[df['capillary_refill_time'] == '3', 'capillary_refill_time'] = np.nan
  df.loc[(df['pain'] == 'slight') | (df['pain'] == 'moderate'), 'pain'] = 'mild_pain'
  #df.loc[df['pain'] == 'None', 'pain'] = 'extreme_pain'
  df.loc[df['peristalsis'] == 'distend_small', 'peristalsis'] = np.nan
  df.loc[(df['abdominal_distention'] == 'none') | (df['abdominal_distention'] == 'None'), 'abdominal_distention'] = 'absent'
  df.loc[(df['nasogastric_tube'] == 'none') | (df['nasogastric_tube'] == 'None'), 'nasogastric_tube'] = 'absent'
  df.loc[(df['nasogastric_reflux'] == 'none') | (df['nasogastric_reflux'] == 'None'), 'nasogastric_reflux'] = 'absent'
  df.loc[df['nasogastric_reflux'] == 'slight', 'nasogastric_reflux'] = np.nan
  df.loc[df['rectal_exam_feces'] == 'None', 'rectal_exam_feces'] = 'absent'
  df.loc[df['rectal_exam_feces'] == 'serosanguious', 'rectal_exam_feces'] = np.nan
  df.loc[df['abdomen'] == 'None', 'abdomen'] = 'distend_small'
  df.loc[df['abdomo_appearance'] == 'None', 'abdomo_appearance'] = np.nan
  #df['hospital_number'] = df['hospital_number'].apply ( lambda x: str(x)[:3])

  return df

In [4]:
def new_priznak(df):
  df['protein'] = np.where((df['total_protein'] < 30), 'normal', 'many') # Разбиваем по количеству белка

  df["deviation_from_normal_temp"] = df["rectal_temp"].apply(lambda x: abs(x - 37.8))

  df['surgery_required'] = np.where((df['pain'] == 'extreme_pain') | (df['abdominal_distention'] == 'severe'), 'yes', 'no') # Добавим признак необходимости операции

  df['septicemia'] = np.where(df['mucous_membrane'] == 'bright_red', 'yes', 'no') # Добавим признак наличия септицемии

  df['intestinal_damage'] = np.where((df['abdomo_appearance'] == 2) |
                                     (df['abdomo_appearance'] == 3), 'yes', 'no') # Заменим признак Внешний вид живота (abdomo_appearance) на поражение кишечника

  df['nasogastric_reflux_ph'] = np.where((df['nasogastric_reflux_ph'] <= 2), 'normal', 'many') # nasogastric_reflux_ph

  df.drop(['id', 'lesion_2', 'lesion_3', 'rectal_temp'], axis= 1, inplace = True) # Убираем неликвидные признаки

  return df


In [5]:
def not_nan(df):

  # Заменим пропуски в категориальных данных предыдущими значением
  category_columns = list(df.select_dtypes(include=[object]).columns)
  for column in category_columns:
    df[column].fillna(method='ffill', inplace=True)

  # Заменим пропуски в числовых данных
  num_columns = list((df.select_dtypes(include=[int, float]).columns))
  for column in num_columns:
    df[column].fillna(df[column].mean(), inplace=True)

  return df

In [10]:
def work_fith_data(df, key):

  # Применим степенное преобразование для числовых данных
  num_columns = list((df.select_dtypes(include=[int, float]).columns))
  pwt = PowerTransformer()
  num_df = pd.DataFrame(pwt.fit_transform(df[num_columns]))
  num_df.columns = num_columns

  category_columns = list((df.select_dtypes(include=[object]).columns))
  if key == 0: # One hot encoding для категориальных данных
    ohe = OneHotEncoder(handle_unknown='ignore')
    cat_df = pd.DataFrame(ohe.fit_transform(df[category_columns]).toarray())
  elif key == 1: # Ordinal encoding для категориальных данных
    ore = OrdinalEncoder()
    cat_df = pd.DataFrame(ore.fit_transform(df[category_columns]))
    cat_df.columns = category_columns

  return num_df.join(cat_df)

In [21]:
train_df = pred_obrabotka(train_df)
test_df = pred_obrabotka(test_df)
train_df = new_priznak(train_df)
test_df = new_priznak(test_df)
train_df = not_nan(train_df)
test_df = not_nan(test_df)
train_df = work_fith_data(train_df, 1) # датафрейм для обучения
test_df = work_fith_data(test_df, 1)

  df[column].fillna(method='ffill', inplace=True)
  df[column].fillna(method='ffill', inplace=True)


In [22]:
X = train_df.drop("outcome", axis = 1)
X_test = test_df.drop("outcome", axis = 1)
y = train_df['outcome']
y_test = test_df['outcome']

In [23]:
cbc = CatBoostClassifier(verbose=0, n_estimators=300)
cbc.fit(X, y)
y_pred = cbc.predict(X_test)

print("Градиентный бустинг: ", metrics.accuracy_score(y_test, y_pred))

Градиентный бустинг:  0.7391304347826086


In [24]:
X

Unnamed: 0,hospital_number,pulse,respiratory_rate,packed_cell_volume,total_protein,abdomo_protein,lesion_1,deviation_from_normal_temp,surgery,age,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,abdomo_appearance,surgical_lesion,cp_data,protein,surgery_required,septicemia,intestinal_damage
0,-5.551115e-17,1.562170,-0.159673,0.760949,-0.023389,0.255352,-0.143554,-0.626233,1.0,0.0,...,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.000000e+00,0.464584,-1.586035,-1.809428,1.670862,-0.815346,-0.143924,-0.626233,1.0,0.0,...,1.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-5.551115e-17,1.306175,0.145517,-1.278172,-0.721301,0.255352,0.649808,-0.056721,1.0,0.0,...,0.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,0.0,0.0
3,5.551115e-16,-0.086662,0.280510,0.414187,-0.479106,0.553805,-0.143924,0.376846,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,5.551115e-16,-0.991131,1.172541,-0.155436,-0.372874,-0.303949,-2.405315,-0.983485,0.0,0.0,...,0.0,3.0,3.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,0.000000e+00,1.500537,1.172541,0.760949,1.676873,-0.815346,-0.144664,0.376846,1.0,0.0,...,1.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
1231,-5.551115e-17,-0.591859,1.247687,-1.536682,-0.721301,0.378743,-0.143554,-0.626233,1.0,0.0,...,0.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0
1232,-5.551115e-17,0.337253,0.832324,-0.914019,-0.960194,1.899718,-1.206855,-0.626233,1.0,1.0,...,0.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
1233,0.000000e+00,-0.164456,-0.984353,0.844033,1.697307,-0.815346,-0.143554,-0.626233,1.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [20]:
train_df.to_csv('../../data/baselines/train.csv', sep=',', index = False)
test_df.to_csv('../../data/baselines/test.csv', sep=',', index = False)