In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas_profiling
import random
import os
# Single seed value shared by every generator seeded in this cell.
seed = 42

# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# does not change str/bytes hashing in the already-running kernel; it is only
# inherited by processes launched afterwards.
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed the stdlib generator and NumPy's legacy global generator.
random.seed(seed)
np.random.seed(seed)

In [3]:
# Load the two input tables; the paths are relative, so both CSV files must be
# in the kernel's current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Columns removed from both frames before modelling (presumably row/enrollee
# identifiers -- TODO confirm against the data description).
# NOTE: this cell rebinds `train`/`test`, so running it a second time raises a
# KeyError because the columns are already gone.
drop_columns = ['index', 'enrollee_id']
train = train.drop(columns=drop_columns)
test = test.drop(columns=drop_columns)

In [5]:
def _fillna(train:pd.DataFrame, test:pd.DataFrame, column, value):
    train[column] = train[column].fillna(value)
    test[column] = test[column].fillna(value)

In [6]:
# Replace missing values in these columns with the string '-1' so that
# "missing" becomes a category of its own in both frames.
for _column in ('company_size', 'company_type', 'last_new_job'):
    _fillna(train, test, _column, '-1')

In [2]:
def label_encode(train: pd.DataFrame, test: pd.DataFrame, columns: list):
    """Label-encode the given columns of both frames in place.

    For each column the encoder is fitted on the distinct values found in
    ``train`` and ``test`` together, so a value that occurs in only one of the
    frames can still be transformed. Each column is then overwritten with its
    encoded form in both frames. Nothing is returned.

    Args:
        train: First frame; the listed columns are replaced by their codes.
        test: Second frame; encoded with the same per-column mapping.
        columns: Labels of the columns to encode; each must exist in both frames.
    """
    encoder = LabelEncoder()

    for name in columns:
        # One encoder instance is reused: it is fitted again for every column.
        distinct_values = pd.concat([train[name], test[name]]).drop_duplicates()
        encoder.fit(distinct_values)
        for frame in (train, test):
            frame[name] = encoder.transform(frame[name])

In [7]:
# Columns passed to label_encode, which overwrites them in both frames.
categorical_columns = [
    'city',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]
label_encode(train, test, categorical_columns)

In [8]:
# Split the label off the feature frame: keep it as an int64 Series, then
# rebind `train` to a copy without it.
# NOTE: re-running this cell raises a KeyError because 'target' has already
# been dropped from `train`.
y = train['target'].astype('int64')
train = train.drop(columns='target')

In [11]:
# Fit a random forest on the training features and predict the test frame.
# NOTE(review): random_state=1 is independent of the notebook-level `seed`
# (42) -- confirm this is intentional.
model = RandomForestClassifier(random_state=1)
model.fit(train, y)
res = model.predict(test)

# Attach the predictions to the sample submission as its 'target' column
# (assumes the sample file has one row per test row -- TODO confirm).
submit = pd.read_csv('sample_submit.csv').assign(target=res)

In [12]:
# Write the submission frame to the working directory; index=False keeps the
# DataFrame's row index out of the file.
submit.to_csv('submit.csv', index=False)