In [1]:
from importlib import metadata

# Print the installed version of every library this notebook relies on, so the
# environment that produced the outputs below is recorded with the results.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
ls_libs = ["numpy", "pandas", "scipy", "scikit-learn"]
for n_lib in ls_libs:
    try:
        vak_ver = metadata.version(n_lib)
    except metadata.PackageNotFoundError:
        # Report the gap and keep going instead of aborting the whole cell.
        vak_ver = "not installed"
    print(f"{n_lib}: {vak_ver}")

numpy: 2.2.3
pandas: 2.2.3
scipy: 1.15.2
scikit-learn: 1.6.1


In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

### 전처리

In [3]:
# Load the raw data set from the CSV file (path is relative to the working
# directory) and preview the first two rows.
df = pd.read_csv(filepath_or_buffer="set_06_data.csv")
df.head(n=2)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevant_exp,enrolled_university,edu_level,major_discipline,exp,company_size,company_type,last_new_job,training_hours,target,Xgrp
0,8949,city_103,0.92,Male,Has,no_enrollment,Graduate,STEM,>20,,,1,36.0,1,train
1,29725,city_40,0.776,Male,No,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0,train


In [4]:
# Step 1 of preprocessing: remove the city and company descriptor columns.
df_s1 = df.drop(["city", "company_size", "company_type"], axis="columns")

In [None]:
# Per-column overview: "t" is the dtype, "c" is the number of missing values.
pd.DataFrame({"t": df_s1.dtypes, "c": df_s1.isna().sum()})

In [9]:
# Step 2 of preprocessing: keep only rows without any missing value.
df_s2 = df_s1.dropna(axis=0, how="any")

In [None]:
# Step 3 of preprocessing: drop the rows whose "exp" is one of the open-ended
# labels ">20" / "<1" (they cannot be cast to a number), then store "exp" as int.
# Building the result in one chain with .assign() yields an independent frame,
# so no column is written onto a slice of df_s2 (avoids SettingWithCopyWarning).
df_s3 = (
    df_s2
    .loc[~df_s2["exp"].isin([">20", "<1"])]
    .assign(exp=lambda frame: frame["exp"].astype("int"))
)

In [None]:
# Step 4 of preprocessing: drop the rows whose "last_new_job" is one of the
# non-numeric labels ">4" / "never", then store the column as int.
# Building the result in one chain with .assign() yields an independent frame,
# so no column is written onto a slice of df_s3 (avoids SettingWithCopyWarning).
df_s4 = (
    df_s3
    .loc[~df_s3["last_new_job"].isin([">4", "never"])]
    .assign(last_new_job=lambda frame: frame["last_new_job"].astype("int"))
)

In [13]:
# Final preprocessing step: renumber the surviving rows 0..n-1 and discard the
# old index labels left over from the filtering above.
df_base = df_s4.reset_index(
    drop=True,
)

In [14]:
# Number of rows left after preprocessing.
df_base.shape[0]

7522

### Q1.

In [None]:
# Q1 works on respondents whose education level is "Masters" or "Phd"; only the
# experience flag and the target are needed.
is_q1_edu = df_base["edu_level"].isin(["Masters", "Phd"])
df_q1 = df_base.loc[is_q1_edu, ["relevant_exp", "target"]]
df_q1.head(n=2)

In [None]:
# Split the Q1 population by the relevant-experience flag:
# group A = "No", group B = "Has". The cell displays both group sizes.
df_q1_A = df_q1[df_q1["relevant_exp"] == "No"]
df_q1_B = df_q1[df_q1["relevant_exp"] == "Has"]
(df_q1_A.shape[0], df_q1_B.shape[0])

In [None]:
# Share of rows with target == 1 inside each group. The normalized value counts
# are indexed by the target value, so .loc[1] picks the share of the label 1.
stat_A = df_q1_A["target"].value_counts(normalize=True).loc[1]
stat_B = df_q1_B["target"].value_counts(normalize=True).loc[1]
(stat_A, stat_B)

In [None]:
# Q1 answer: target rate of group A ("No") relative to group B ("Has"),
# rounded to two decimals.
round(
    stat_A / stat_B,
    2,
)

In [25]:
# Cross-check of the two rates above: mean of the 0/1 target per experience group.
df_q1.groupby(by="relevant_exp")["target"].agg("mean")

relevant_exp
Has    0.177343
No     0.281421
Name: target, dtype: float64

### Q2.

In [34]:
# Categorical predictors for Q2, selected by name rather than by position so the
# selection cannot silently shift if the column order of df_base ever changes
# (a missing column now raises KeyError instead of picking the wrong data).
ls_col_cat = ["gender", "relevant_exp", "enrolled_university", "edu_level", "major_discipline"]
df_q2_obj = df_base[ls_col_cat]
df_q2_obj.head(2)

Unnamed: 0,gender,relevant_exp,enrolled_university,edu_level,major_discipline
0,Male,Has,no_enrollment,Graduate,STEM
1,Male,Has,no_enrollment,Graduate,STEM


In [35]:
# One-hot encode every categorical column.
# Exam-environment version (no dtype argument):
#   df_q2_dum = pd.get_dummies(df_q2_obj, columns = df_q2_obj.columns.to_list())
# Latest-version form used here: dtype="int" requests 0/1 integer dummy columns.
df_q2_dum = pd.get_dummies(
    df_q2_obj,
    columns=list(df_q2_obj.columns),
    dtype="int",
)

In [None]:
# Names of the dummy columns at positions 2, 4, 7, 10 and 16, i.e. the ones the
# next step removes. Presumably one reference level per variable; verify here.
df_q2_dum.columns.take([2, 4, 7, 10, 16])

In [41]:
# pd.DataFrame(dict(c = df_q2_dum.columns))

In [None]:
# Remove the dummy columns at positions 2, 4, 7, 10 and 16.
# NOTE(review): presumably these are the reference level of each categorical
# variable; the positions depend on the column order produced by get_dummies.
df_q2_s2 = df_q2_dum.drop(df_q2_dum.columns[[2, 4, 7, 10, 16]], axis="columns")
df_q2_s2.head(n=1)

In [50]:
# Assemble the modelling frame: target, the train/test marker, the numeric
# predictors, and the remaining dummy columns, placed side by side
# (rows are matched on the index).
ls_col_x = ["city_development_index", "exp", "last_new_job", "training_hours"]
df_job2 = pd.concat(
    [df_base.loc[:, ["target", "Xgrp", *ls_col_x]], df_q2_s2],
    axis="columns",
)
df_job2.head(n=1)

Unnamed: 0,target,Xgrp,city_development_index,exp,last_new_job,training_hours,gender_Female,gender_Male,relevant_exp_Has,enrolled_university_Full time course,enrolled_university_Part time course,edu_level_Graduate,edu_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,1,train,0.92,7,1,46.0,0,1,1,0,0,1,0,0,0,0,0,0


In [None]:
# (rows, columns) of the assembled modelling frame.
df_job2.shape

In [None]:
# Q2: logistic regression of target on every predictor in df_job2, fitted on all
# rows (the Xgrp marker is dropped, not used for filtering here).
# In scikit-learn C is the inverse of the regularization strength, so the very
# large C keeps the penalty weak.
model_lr = LogisticRegression(
    solver="liblinear",
    C=100000,
    fit_intercept=True,
    max_iter=1000,
    random_state=1234,
)
model_lr.fit(df_job2.drop(columns=["target", "Xgrp"]), df_job2["target"])

In [None]:
model_lr.coef_ # regression coefficients

In [None]:
np.exp(model_lr.coef_) # odds ratios (OR): exponentiated coefficients

In [61]:
# Q2 answer: the largest odds ratio among all predictors (about 1.67).
np.max(np.exp(model_lr.coef_))

np.float64(1.6738948061770285)

In [58]:
# Odds ratios as a one-row table, one column per predictor.
# The labels come from the fitted model itself (feature_names_in_ holds the
# column names seen during fit), so they stay aligned with coef_ even if the
# column layout of df_job2 changes; a positional slice of df_job2.columns
# would not guarantee that.
df_or = pd.DataFrame(np.exp(model_lr.coef_), columns = model_lr.feature_names_in_)
df_or

Unnamed: 0,city_development_index,exp,last_new_job,training_hours,gender_Female,gender_Male,relevant_exp_Has,enrolled_university_Full time course,enrolled_university_Part time course,edu_level_Graduate,edu_level_Masters,major_discipline_Arts,major_discipline_Business Degree,major_discipline_Humanities,major_discipline_No Major,major_discipline_Other
0,0.002116,0.971941,1.099657,0.999068,0.842418,0.870245,0.463938,1.673895,0.75246,1.296251,0.95707,1.337853,1.124161,1.279196,1.502104,0.64428


### Q3. 

In [63]:
# Step 1: split the modelling frame into train and test rows using the Xgrp
# marker, then show both sizes.
df_train = df_job2[df_job2["Xgrp"] == "train"]
df_test = df_job2[df_job2["Xgrp"] == "test"]
(df_train.shape[0], df_test.shape[0])

(4706, 2816)

In [None]:
# Fit a 3-nearest-neighbour classifier on the training rows, then predict the
# target for the test rows.
# NOTE(review): the predictors are passed in unscaled, so wide-range columns
# such as training_hours presumably dominate the distances; confirm intended.
ls_col_non_x = ["target", "Xgrp"]
model_knn = KNeighborsClassifier(n_neighbors=3)
model_knn.fit(df_train.drop(columns=ls_col_non_x), df_train["target"])
pred = model_knn.predict(X=df_test.drop(columns=ls_col_non_x))
pred[:4]  # Step 3: preview of the first four predictions

In [71]:
# Short aliases for the evaluation below: observed labels and predicted labels
# of the test rows.
y_t, y_p = df_test["target"], pred

In [None]:
from sklearn.metrics import accuracy_score

# Q3 answer: share of test rows whose predicted label equals the observed label,
# rounded to two decimals.
round(accuracy_score(y_t, y_p), 2)

In [None]:
# Confusion table: rows are the observed labels, columns the predicted labels.
df_tab = pd.crosstab(index=y_t, columns=y_p)
df_tab

In [None]:
# Accuracy recomputed from the confusion table: the two cells where observed and
# predicted labels agree (0/0 and 1/1), divided by the total count.
round((df_tab.loc[0, 0] + df_tab.loc[1, 1]) / df_tab.sum().sum(), 2)

In [87]:
# Optional alternative (not required to know): the same accuracy as the sum of
# the table's main diagonal divided by the sum of all cells.
round(np.trace(df_tab.to_numpy()) / df_tab.to_numpy().sum(), 2)

np.float64(0.69)