In [307]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas_profiling
import random
import os
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

In [308]:
def label_encode(train:pd.DataFrame, test:pd.DataFrame, columns:list):
    le = LabelEncoder()
    
    for column in columns:
        labels = pd.concat([train[column], test[column]]) .drop_duplicates()
        le.fit(labels)
        train[column] = le.transform(train[column])
        test[column] = le.transform(test[column])

In [309]:
def _classification(x, y, result:list):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y)
    
    for model in [KNeighborsClassifier(), KNeighborsClassifier(weights='distance'), LogisticRegression(solver='newton-cg'), LinearDiscriminantAnalysis(), DecisionTreeClassifier(), RandomForestClassifier()]:
        model.fit(x_train, y_train)
        res = model.predict(x_test)
        result.append({'model': str(model), 'score': accuracy_score(res, y_test)})

In [310]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [311]:
drop_columns = ['gender', 'enrollee_id', 'major_discipline', 'company_size', 'company_type']
train = train.drop(drop_columns, axis=1)
test = test.drop(drop_columns, axis=1)

In [312]:
def _replace(train:pd.DataFrame, test:pd.DataFrame, column, condition, value):
    train.loc[train[train[column] == condition].index, column] = value
    test.loc[test[test[column] == condition].index, column] = value

In [313]:
def _fillna(train:pd.DataFrame, test:pd.DataFrame, column, value):
    train[column] = train[column].fillna(value)
    test[column] = test[column].fillna(value)

In [314]:
train.enrolled_university.unique()

array(['no_enrollment', nan, 'Part time course', 'Full time course'],
      dtype=object)

In [315]:
_fillna(train, test, 'enrolled_university', '-1')
_fillna(train, test, 'education_level', '-1')
_fillna(train, test, 'experience', '-1')
_fillna(train, test, 'last_new_job', '-1')

In [316]:
_replace(train, test, 'last_new_job', '>4', '5')
_replace(train, test, 'last_new_job', 'never', '0')
_replace(train, test, 'experience', '>20', '21')
_replace(train, test, 'experience', '<1', '0')


In [317]:
label_encode(train, test, ['city', 'relevent_experience', 'enrolled_university', 'education_level'])

In [318]:
y = train['target']
y = y.astype('int64')
train = train.drop('target', axis=1)

In [319]:
result = []
_classification(train, y, result)
print(pd.DataFrame(result))



                                      model     score
0                    KNeighborsClassifier()  0.693738
1  KNeighborsClassifier(weights='distance')  0.694716
2    LogisticRegression(solver='newton-cg')  0.755055
3              LinearDiscriminantAnalysis()  0.750489
4                  DecisionTreeClassifier()  0.686888
5                  RandomForestClassifier()  0.759295


In [320]:
model = RandomForestClassifier()
model.fit(train, y)
res = model.predict(test)
submit = pd.read_csv('sample_submit.csv')
submit['target'] = res

In [321]:
submit.to_csv('submit.csv', index=False)