## 프로그래머스 채용공고 추천 (ML) 
- https://school.programmers.co.kr/skill_check_assignments/1

### 1. 파일 압축해제

In [None]:
import zipfile
import os

def unzip_file(zip_path):
    # 현재 작업 디렉토리
    current_directory = os.getcwd()

    # zip 파일명에서 확장자를 제외한 부분을 폴더명으로 사용
    folder_name = os.path.splitext(os.path.basename(zip_path))[0]

    # 압축을 해제할 폴더 경로
    extract_path = os.path.join(current_directory, folder_name)

    # 폴더 생성 및 압축 해제
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# 예제 사용
zip_file_path = 'train_job.zip'  # 압축 해제할 zip 파일 경로

# 압축 해제 수행
unzip_file(zip_file_path)


### 2. 데이터 불러오기

In [142]:
# 데이터 불러오기
import pandas as pd
from sklearn.preprocessing import LabelEncoder
job_companies = pd.read_csv("train_job/job_companies.csv")
job_companies = job_companies.dropna()
job_tags = pd.read_csv("train_job/job_tags.csv")

tags = pd.read_csv("train_job/tags.csv")
train = pd.read_csv("train_job/train.csv")
user_tags = pd.read_csv("train_job/user_tags.csv")
user_tags = user_tags.drop_duplicates()
test_job = pd.read_csv("test_job.csv")


### 3. 전처리, join, 인코딩

In [144]:
# tag들만 따로 인코딩
le = LabelEncoder()
tags['tagID'] = le.fit_transform(tags['tagID'])
user_tags['tagID'] = le.transform(user_tags['tagID'])
job_tags['tagID'] = le.transform(job_tags['tagID'])

In [145]:
# 테이블 join
job_tags = pd.merge(job_tags, tags, on='tagID', how='inner')
user_tags = pd.merge(user_tags, tags, on='tagID', how='inner')
job_tags.rename(columns = {'tagID' : 'job_tagID'}, inplace = True)
job_tags = job_tags.groupby('jobID')['job_tagID'].apply(list).reset_index()
user_tags.rename(columns = {'tagID' : 'user_tagID'}, inplace = True)
user_tags = user_tags.groupby('userID')['user_tagID'].apply(list).reset_index()

In [146]:
# 테이블 merge
train_df1 = pd.merge(train, job_tags, on='jobID', how='inner')
train_df2 = pd.merge(train_df1, user_tags, on='userID', how='inner')
train_df3 = pd.merge(train_df2, job_companies, on='jobID', how='left')

test_df1 = pd.merge(test_job, job_tags, on='jobID', how='inner')
test_df2 = pd.merge(test_df1, user_tags, on='userID', how='inner')
test_df3 = pd.merge(test_df2, job_companies, on='jobID', how='left')

In [153]:
# label_encode_column 함수 
def label_encode_column(column):
    le = LabelEncoder()
    return le.fit_transform(column)

# 인코딩 컬럼 선정
columns_to_encode = ['companyID','companySize']  

# 선택한 칼럼만 인코딩
train_df3[columns_to_encode] = train_df3[columns_to_encode].apply(label_encode_column)
test_df3[columns_to_encode] = test_df3[columns_to_encode].apply(label_encode_column)

In [198]:
mlb = MultiLabelBinarizer()
df = train_df3

# MultiLabelBinarizer를 사용하여 job_tagID를 이진 벡터로 변환
mlb = MultiLabelBinarizer()
train_job_tagID_encoded = pd.DataFrame(mlb.fit_transform(train_df3['job_tagID']), columns=mlb.classes_, index=train_df3.index)
train_user_tagID_encoded = pd.DataFrame(mlb.fit_transform(train_df3['user_tagID']), columns=mlb.classes_, index=train_df3.index)
# 기존의 job_tagID 열 제거
train1 = df.drop(['job_tagID','user_tagID'], axis=1)
train2 = df.drop(['job_tagID','user_tagID'], axis=1)
# 인코딩된 열 추가
train1 = pd.concat([train1, train_job_tagID_encoded], axis=1)
train2 = pd.concat([train2, train_user_tagID_encoded], axis=1)
# 필요하다면 NaN 값을 다른 값으로 대체하거나 처리할 수 있습니다.
train2 = train2.fillna(0)

In [204]:
mlb = MultiLabelBinarizer()
df = test_df3

# MultiLabelBinarizer를 사용하여 job_tagID를 이진 벡터로 변환
mlb = MultiLabelBinarizer()
test_job_tagID_encoded = pd.DataFrame(mlb.fit_transform(test_df3['job_tagID']), columns=mlb.classes_, index=test_df3.index)
test_user_tagID_encoded = pd.DataFrame(mlb.fit_transform(test_df3['user_tagID']), columns=mlb.classes_, index=test_df3.index)
# 기존의 job_tagID 열 제거
test1 = df.drop(['job_tagID','user_tagID'], axis=1)
test2 = df.drop(['job_tagID','user_tagID'], axis=1)
# 인코딩된 열 추가
test1 = pd.concat([test1, test_job_tagID_encoded], axis=1)
test2 = pd.concat([test2, test_user_tagID_encoded], axis=1)
# 필요하다면 NaN 값을 다른 값으로 대체하거나 처리할 수 있습니다.
test2 = test2.fillna(0)

train2.columns = train2.columns.astype(str)
test2.columns = test2.columns.astype(str)


In [205]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool, cv
X = train2.drop(['userID','jobID',"applied"],axis=1)
y = train2['applied']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4800, 347), (1200, 347), (4800,), (1200,))

In [206]:
# !pip install xgboost
# 여러 모델 정의
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

models = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ('CatBoost', CatBoostClassifier(verbose=0))  # verbose=0은 출력을 감춥니다.
]
# 각 모델에 대해 훈련 및 예측 수행
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # 정확도 계산
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name}: Accuracy = {accuracy:.4f}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: Accuracy = 0.8633
Random Forest: Accuracy = 0.8008
CatBoost: Accuracy = 0.8650


In [209]:
test_job['applied'] = model.predict(test2.drop(['userID','jobID'],axis=1))

In [215]:
test_job.to_csv("sub.csv",index=False)