In [1]:
import pandas as pd
import dill
import os

model_path = 'app/model/dataset/model.dill'
dir = os.getcwd()

X_train = pd.read_csv(dir + "/aug_train.csv")
# y_train = X_train.pop('target')

X_test = pd.read_csv(dir + "/aug_test.csv")


In [2]:
# from pandas_profiling import ProfileReport
# profile = ProfileReport(X_train, title='Pandas Profiling Report')
# profile.to_file(output_file=f"{dir}/profile.html")


In [3]:
from sklearn.model_selection import train_test_split


def transform(df):

    # drop unnecessary columns
    columns = ['enrollee_id']
    try:
        df.drop(columns=columns, inplace=True)
    except:
        pass

    # fill missing values
    columns = ['enrolled_university', 'education_level',
               'major_discipline', 'experience']
    df[columns] = df[columns].fillna('None')

    # drop columns with missing values
    df.dropna(axis=0, how='any', inplace=True)

    return df


X_train = transform(X_train)
y_train = X_train.pop('target')

X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, train_size=0.7, random_state=33)


In [4]:
from catboost import CatBoostClassifier

cat_features = ['city', 'gender', 'relevent_experience',
                'enrolled_university', 'education_level', 'major_discipline',
                'experience', 'company_size', 'company_type', 'last_new_job']

cat = CatBoostClassifier(iterations=1000,
                         depth=2,
                         loss_function="Logloss",
                         eval_metric='Accuracy',
                         verbose=False)

cat.fit(X_train, y_train,
        cat_features=cat_features,
        eval_set=(X_validation, y_validation),
        use_best_model=True)

print(f'Model is fitted: {str(cat.is_fitted())}')
print('Model params:', cat.get_params(), '\n')
print('Best iteration: ',cat.best_iteration_, '\n')
print('Best score: ', cat.get_best_score(), '\n')
print('Feature importances:')
[print(i) for i in list(zip(cat.feature_names_, cat.get_feature_importance()))];



Model is fitted: True
Model params: {'iterations': 1000, 'depth': 2, 'loss_function': 'Logloss', 'verbose': False, 'eval_metric': 'Accuracy'} 

Best iteration:  392 

Best score:  {'learn': {'Accuracy': 0.8678665496049166, 'Logloss': 0.343977911548915}, 'validation': {'Accuracy': 0.8689419795221843, 'Logloss': 0.347459018406308}} 

Feature importances:
('city', 6.700479521955547)
('city_development_index', 76.05593717650144)
('gender', 0.8568288514585269)
('relevent_experience', 1.984176513268395)
('enrolled_university', 2.8374511700107767)
('education_level', 0.23077529968763838)
('major_discipline', 0.5937789853156852)
('experience', 2.8619329812161465)
('company_size', 2.179297135717084)
('company_type', 1.787376510671769)
('last_new_job', 2.3333612071723833)
('training_hours', 1.5786046470246917)


In [5]:
transform(X_test).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1131 entries, 3 to 2128
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1131 non-null   object 
 1   city_development_index  1131 non-null   float64
 2   gender                  1131 non-null   object 
 3   relevent_experience     1131 non-null   object 
 4   enrolled_university     1131 non-null   object 
 5   education_level         1131 non-null   object 
 6   major_discipline        1131 non-null   object 
 7   experience              1131 non-null   object 
 8   company_size            1131 non-null   object 
 9   company_type            1131 non-null   object 
 10  last_new_job            1131 non-null   object 
 11  training_hours          1131 non-null   int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 114.9+ KB


In [6]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1131 entries, 3 to 2128
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1131 non-null   object 
 1   city_development_index  1131 non-null   float64
 2   gender                  1131 non-null   object 
 3   relevent_experience     1131 non-null   object 
 4   enrolled_university     1131 non-null   object 
 5   education_level         1131 non-null   object 
 6   major_discipline        1131 non-null   object 
 7   experience              1131 non-null   object 
 8   company_size            1131 non-null   object 
 9   company_type            1131 non-null   object 
 10  last_new_job            1131 non-null   object 
 11  training_hours          1131 non-null   int64  
dtypes: float64(1), int64(1), object(10)
memory usage: 114.9+ KB


In [7]:

target = X_test 
target['target'] = pd.Series(cat.predict(X_test))
target.groupby('target').count()


Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0.0,514,514,514,514,514,514,514,514,514,514,514,514
1.0,80,80,80,80,80,80,80,80,80,80,80,80


In [8]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1131 entries, 3 to 2128
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    1131 non-null   object 
 1   city_development_index  1131 non-null   float64
 2   gender                  1131 non-null   object 
 3   relevent_experience     1131 non-null   object 
 4   enrolled_university     1131 non-null   object 
 5   education_level         1131 non-null   object 
 6   major_discipline        1131 non-null   object 
 7   experience              1131 non-null   object 
 8   company_size            1131 non-null   object 
 9   company_type            1131 non-null   object 
 10  last_new_job            1131 non-null   object 
 11  training_hours          1131 non-null   int64  
 12  target                  594 non-null    float64
dtypes: float64(2), int64(1), object(10)
memory usage: 123.7+ KB
