In [45]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['font.size'] = 12
plt.rcParams['savefig.format'] = 'pdf'
sns.set_style('darkgrid')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
import numpy as np
import pandas as pd
from pathlib import Path

In [47]:
import gdown

if not Path('Students').exists():
    url = "https://drive.google.com/drive/folders/1q7zTd8slQVpSBH3L_MVQO2Nj53QZkBSt"
    gdown.download_folder(url)

In [48]:
data = pd.read_csv('Students/students_train.csv')
print(data.shape)
data.head()

(11470, 24)


Unnamed: 0,ID,Код_группы,Год_Поступления,Пол,Основания,Изучаемый_Язык,Дата_Рождения,Уч_Заведение,Где_Находится_УЗ,Год_Окончания_УЗ,...,Общежитие,Наличие_Матери,Наличие_Отца,Страна_Родители,Опекунство,Село,Иностранец,КодФакультета,СрБаллАттестата,Статус
0,0,20018,2016,Муж,СН,Английский язык,1994-04-12 00:00:00.000,ФГБОУ ВО Алтайский государственный университет,"Россия, Алтайский край, г Барнаул",2016.0,...,0.0,1,1.0,Россия,0.0,1.0,0.0,26.0,45.0,1
1,1,20846,2016,Жен,БН,Английский язык,1996-12-21 00:00:00.000,КГБ ПОУ Международный колледж сыроделия и проф...,"Россия, Алтайский край, г Барнаул",2016.0,...,1.0,1,1.0,Россия,0.0,0.0,0.0,34.0,57.0,0
2,2,14762,2013,Жен,СН,Английский язык,1992-08-01 00:00:00.000,"ФГБОУ ВПО ""Алтайский государственный университет""",Алтайский край г. Барнаул,2013.0,...,0.0,0,0.0,Россия,0.0,0.0,,25.0,43.0,0
3,3,17815,2015,Жен,СН,Английский язык,1994-02-06 00:00:00.000,"КГБОУСПО ""Алтайский строительный техникум """,р.п.Степное озеро,2014.0,...,0.0,1,1.0,Россия,0.0,0.0,0.0,25.0,44.0,1
4,4,17011,2014,Жен,ОО,Английский язык,1997-03-08 00:00:00.000,"МБОУ ""Лицей №3""","Алтайский край, Барнаул г",2014.0,...,0.0,0,0.0,Россия,0.0,0.0,0.0,30.0,56.0,1


In [49]:
from datetime import datetime

data = data.drop(columns=['Код_группы', 'ID'])
data['Дата_Рождения'] = pd.to_datetime(data['Дата_Рождения'])
data['Дата_Рождения'] = (data['Дата_Рождения'] - pd.to_datetime(datetime.today())).dt.days

In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


categorical = ['Пол', 'Основания', 'Изучаемый_Язык', 'Уч_Заведение', 'Где_Находится_УЗ', 'Пособие', 'Страна_ПП', 'Регион_ПП', 'Город_ПП', 'Общежитие', 'Наличие_Матери', 'Наличие_Отца', 'Страна_Родители', 'Опекунство', 'Село', 'Иностранец', 'КодФакультета']
numeric = ['Год_Поступления', 'Дата_Рождения', 'Год_Окончания_УЗ', 'СрБаллАттестата']

# for nc in numeric:
#     data = data.loc[~data[nc].isna()]
data = data.dropna()

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('scaling', StandardScaler(), numeric),
    # ('other',  'passthrough', other)
])

target = data.iloc[:, [-1]].to_numpy()
data = column_transformer.fit_transform(data.iloc[:, :-1]).toarray()

In [51]:
data.shape

(9048, 5865)

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target, train_size=0.8, random_state=3)

In [53]:
from sklearn.metrics import f1_score

def calc_score(true, pred):
    return max(0, 40 * (f1_score(true, pred) - 0.5))

In [54]:
from catboost import CatBoostClassifier

features = {
    'iterations': 1000,
    'random_seed': 1,
    'eval_metric': 'F1',
    'verbose': 100
}
model = CatBoostClassifier(
    **features
)

In [55]:
features = model.select_features(
    X_train,
    y=y_train,
    eval_set=(X_test, y_test),
    features_for_select=np.arange(X_train.shape[1]),
    num_features_to_select=4000
)

Learning rate set to 0.05167
Step #1 out of 1
0:	learn: 0.5469457	test: 0.5044843	best: 0.5044843 (0)	total: 6.72ms	remaining: 6.72s
100:	learn: 0.7339623	test: 0.6869806	best: 0.6883721 (89)	total: 824ms	remaining: 7.33s
200:	learn: 0.7602582	test: 0.7114338	best: 0.7115560 (184)	total: 1.76s	remaining: 7.01s
300:	learn: 0.7817029	test: 0.7164716	best: 0.7179024 (264)	total: 2.72s	remaining: 6.32s
400:	learn: 0.7895330	test: 0.7202860	best: 0.7202860 (395)	total: 3.64s	remaining: 5.44s
500:	learn: 0.8024194	test: 0.7154326	best: 0.7202860 (395)	total: 4.56s	remaining: 4.54s
600:	learn: 0.8051018	test: 0.7192205	best: 0.7245350 (570)	total: 5.43s	remaining: 3.6s
700:	learn: 0.8117752	test: 0.7197171	best: 0.7245350 (570)	total: 6.36s	remaining: 2.71s
800:	learn: 0.8166259	test: 0.7231041	best: 0.7245350 (570)	total: 7.19s	remaining: 1.79s
900:	learn: 0.8231910	test: 0.7224670	best: 0.7247142 (825)	total: 8.19s	remaining: 900ms
999:	learn: 0.8427756	test: 0.7218310	best: 0.7247142 (825)

In [56]:
selected_features = features['selected_features']

In [57]:
model_features = {
    'iterations': 1000,
    'random_seed': 1,
    'eval_metric': 'F1',
    'verbose': 100
}
new_model = CatBoostClassifier(
    **model_features
)

In [58]:
new_model.fit(X_train[:, selected_features], y_train, eval_set=(X_test[:, selected_features], y_test))
pred = new_model.predict(X_test[:, selected_features])
calc_score(y_test, pred)

Learning rate set to 0.05167
0:	learn: 0.6360065	test: 0.6194369	best: 0.6194369 (0)	total: 4.63ms	remaining: 4.62s
100:	learn: 0.7380730	test: 0.6918123	best: 0.6918123 (100)	total: 439ms	remaining: 3.91s
200:	learn: 0.7670481	test: 0.7133872	best: 0.7133872 (198)	total: 872ms	remaining: 3.46s
300:	learn: 0.7866184	test: 0.7274336	best: 0.7274336 (294)	total: 1.3s	remaining: 3.03s
400:	learn: 0.7988325	test: 0.7288732	best: 0.7323944 (361)	total: 1.73s	remaining: 2.59s
500:	learn: 0.8059969	test: 0.7318777	best: 0.7323944 (361)	total: 2.17s	remaining: 2.16s
600:	learn: 0.8409596	test: 0.7321739	best: 0.7351916 (527)	total: 2.6s	remaining: 1.72s
700:	learn: 0.8478549	test: 0.7398444	best: 0.7409326 (693)	total: 3.03s	remaining: 1.29s
800:	learn: 0.8526756	test: 0.7413793	best: 0.7420190 (783)	total: 3.46s	remaining: 861ms
900:	learn: 0.8563900	test: 0.7429063	best: 0.7429063 (895)	total: 3.9s	remaining: 428ms
999:	learn: 0.8598336	test: 0.7437875	best: 0.7448630 (993)	total: 4.33s	rema

9.794520547945202

In [59]:
test_data = pd.read_csv('Students/students_test.csv')
ids = test_data['ID']
test_data = test_data.drop(columns=['Код_группы', 'ID'])
test_data['Дата_Рождения'] = pd.to_datetime(test_data['Дата_Рождения'])
test_data['Дата_Рождения'] = (test_data['Дата_Рождения'] - pd.to_datetime(datetime.today())).dt.days

test_data = column_transformer.transform(test_data).toarray()

In [60]:
pred = new_model.predict(test_data[:, selected_features])

In [61]:
submit = pd.DataFrame({
    'ID': ids,
    'Статус': pred
})

submit.to_csv('submit.csv', index=False)