In [126]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

In [127]:
# Load the raw train and test splits from parquet files in the working directory.
train_df = pd.read_parquet(path='train.parquet')
test_df = pd.read_parquet(path='test.parquet')

In [128]:
def get_features(df):
    """Return a copy of ``df`` extended with summary statistics of ``values``.

    Each entry of the ``values`` column is reduced with ``np.mean``,
    ``np.std``, ``np.max`` and ``np.min``; the results are stored in the
    columns ``mean_value``, ``std_value``, ``max_value`` and ``min_value``
    (appended in that order).

    The input frame is left untouched, so re-running a cell that calls this
    function never changes the frame that was originally loaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``values`` column whose entries can be reduced by the
        NumPy functions above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus the four features.
    """
    series = df['values']
    # ``assign`` builds a new frame instead of writing into the caller's one.
    return df.assign(
        mean_value=series.apply(np.mean),
        std_value=series.apply(np.std),
        max_value=series.apply(np.max),
        min_value=series.apply(np.min),
    )

In [129]:
# Add the summary-statistic feature columns to both splits.
train, test = get_features(train_df), get_features(test_df)

In [130]:
# Count missing values per column of the training frame.
train.isna().sum()

id             0
dates          0
values         0
label          0
mean_value    88
std_value     88
max_value     88
min_value     88
dtype: int64

In [131]:
# Discard every row that has a missing value in any column, in both splits.
train, test = train.dropna(), test.dropna()

In [132]:
# Target vector and feature matrix: the features are whatever remains after
# removing the identifier, the raw inputs and the target itself.
non_feature_columns = ['id', 'dates', 'values', 'label']
y = train['label']
X = train.drop(columns=non_feature_columns)

In [133]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [134]:
# Base estimator for the grid search. The forest is seeded (same seed as the
# train/validation split) so that the selected hyperparameters, the feature
# importances and the validation score are repeatable across runs.
clf_rf = RandomForestClassifier(random_state=42)

In [135]:
# Hyperparameter grid explored by the grid search (3 x 4 combinations).
parametrs = dict(
    n_estimators=[10, 20, 30],
    max_depth=[2, 5, 6, 10],
)

In [136]:
# 5-fold cross-validated search over the grid defined in `parametrs`.
grid_search_cv_clf = GridSearchCV(
    estimator=clf_rf,
    param_grid=parametrs,
    cv=5,
)

In [137]:
# Run the search on the training part of the split only.
grid_search_cv_clf.fit(X=X_train, y=y_train)

In [138]:
# Show the hyperparameter combination selected by the grid search.
grid_search_cv_clf.best_params_

{'max_depth': 10, 'n_estimators': 30}

In [139]:
# Keep the search's `best_estimator_` under a short name; the following cells
# use it for feature importances, validation scoring and test predictions.
best_clf = grid_search_cv_clf.best_estimator_

In [140]:
# Pair every training column with the importance reported by the selected
# forest and display the table with the most important feature first.
feature_importances = best_clf.feature_importances_
feature_importances_df = pd.DataFrame(
    {
        'features': X_train.columns.tolist(),
        'feature_importances': feature_importances,
    }
)
feature_importances_df.sort_values(by='feature_importances', ascending=False)

Unnamed: 0,features,feature_importances
0,mean_value,0.767572
3,min_value,0.088075
1,std_value,0.081658
2,max_value,0.062694


In [141]:
# Evaluate the selected model on the held-out validation split using the
# estimator's own default `score` metric.
best_clf.score(X=X_val, y=y_val)

0.7897766376775324

In [142]:
# Count missing values per column of the test frame.
test.isna().sum()

id            0
dates         0
values        0
mean_value    0
std_value     0
max_value     0
min_value     0
dtype: int64

In [143]:
# Drop test rows that contain a missing value in any column (the defaults of
# `dropna`, spelled out explicitly).
test = test.dropna(axis=0, how='any')

In [144]:
# Build the test feature matrix by removing the identifier and raw-input columns.
X_for_pred = test.drop(columns=['id', 'dates', 'values'])

# Take the second column of the predicted class probabilities
# (presumably the probability of label == 1 -- confirm against best_clf.classes_).
y_pred = best_clf.predict_proba(X_for_pred)[:, 1]

# Save the predictions
# NOTE(review): the file holds only the `prediction` column, so rows are matched
# to the test set purely by position -- confirm the expected submission format.
submission = pd.DataFrame({'prediction': y_pred})
submission.to_csv('submission.csv', index=False)