In [15]:
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import KFold
import json

In [2]:
users_df = pd.read_csv('../data/raw/user.csv', index_col='user_id')
decision_df = pd.read_csv('../data/raw/user_decision.csv')
results_df = pd.read_csv('../data/raw/results.csv', index_col='id')
submit_df = pd.read_csv('../data/to_sibmit.csv', index_col='id')

In [3]:
len(decision_df['decision_id'].unique())

343

In [19]:
decision_ids = decision_df['decision_id'].value_counts()[:100].index

In [20]:
for decision_id in decision_ids:
    users_df[str(decision_id)] = users_df.apply(lambda user: ((decision_df['user_id']==user.name)&(decision_df['decision_id']==decision_id)).any(), axis = 1)

In [21]:
users_df.head()

Unnamed: 0_level_0,team_id,game_id,386,25,27,26,276,329,331,330,...,347,9,8,216,231,7,233,346,261,258
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10189,1664,235,True,True,False,False,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
10018,1690,237,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10580,1394,194,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10600,1908,262,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10112,1592,219,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
users_known_df = results_df[[]].merge(users_df, left_index=True, right_index=True)
users_test_df = submit_df[[]].merge(users_df, left_index=True, right_index=True)

In [23]:
columns = ['Analytical thinking', 'Systemic thinking', 'Adaptability', 'Focus']
kf = KFold(n_splits=4, shuffle=True, random_state=42)
clf = RandomForestClassifier()
recall = 0
to_submit_df = submit_df[[]].copy()

for column in columns:
    for train_index, valid_index in kf.split(users_known_df):
        X_train, X_valid = users_known_df.iloc[train_index,:], users_known_df.iloc[valid_index,:]
        y_train, y_valid = results_df.iloc[train_index,:][column], results_df.iloc[valid_index,:][column]
        model = clone(clf).fit(X_train, y_train)
        pred = model.predict(X_valid)
        recall += recall_score(y_valid, pred, labels=[1.0,2.0,3.0,4.0,5.0,6.0], average='micro')
    model_full = clone(clf).fit(users_known_df, results_df[column])
    to_submit_df[column] = model_full.predict(users_test_df)
    print(f'Model for column {column}: {model.__dict__}')

print(recall/32.)
to_submit_df.to_csv('../data/submission/random_forest.csv')

Model for column Analytical thinking: {'base_estimator': DecisionTreeClassifier(), 'n_estimators': 100, 'estimator_params': ('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'random_state', 'ccp_alpha'), 'bootstrap': True, 'oob_score': False, 'n_jobs': None, 'random_state': None, 'verbose': 0, 'warm_start': False, 'class_weight': None, 'max_samples': None, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.0, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'ccp_alpha': 0.0, 'feature_names_in_': array(['team_id', 'game_id', '386', '25', '27', '26', '276', '329', '331',
       '330', '332', '392', '406', '333', '407', '405', '278', '418',
       '66', '425', '427', '277', '426', '419', '92', '67', '420', '101',
       '320', '324', '327', '74', '319', '102', '68', '81', '393', '323',
      