In [1]:
import numpy as np
import pandas as pd 
import matplotlib as plt

# 데이터 불러오기 및 확인

In [2]:
train_data = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

print("\n---Test data Overview---\n")
print(test_data.info)
print("\n---Train data Overview---\n")
print(train_data.info)


---Test data Overview---

<bound method DataFrame.info of          id  Time_spent_Alone Stage_fear  Social_event_attendance  \
0     18524               3.0         No                      7.0   
1     18525               NaN        Yes                      0.0   
2     18526               3.0         No                      5.0   
3     18527               3.0         No                      4.0   
4     18528               9.0        Yes                      1.0   
...     ...               ...        ...                      ...   
6170  24694               3.0         No                      5.0   
6171  24695               8.0        Yes                      2.0   
6172  24696               2.0         No                      4.0   
6173  24697               3.0         No                      4.0   
6174  24698               NaN        Yes                      1.0   

      Going_outside Drained_after_socializing  Friends_circle_size  \
0               4.0                       

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


### **결측치 확인**

In [3]:
null_count = train_data.isnull().sum()
row_count = train_data.shape[0]

print("\n--- 각 컬럼별 결측치 개수 ---")
print(null_count)
print("\n--- 데이터프레임의 총 행 개수 ---")
print(row_count)
print("\n--- 컬럼별 결측치 비율 ---")
print(null_count/row_count)



--- 각 컬럼별 결측치 개수 ---
id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

--- 데이터프레임의 총 행 개수 ---
18524

--- 컬럼별 결측치 비율 ---
id                           0.000000
Time_spent_Alone             0.064241
Stage_fear                   0.102192
Social_event_attendance      0.063701
Going_outside                0.079141
Drained_after_socializing    0.062028
Friends_circle_size          0.056899
Post_frequency               0.068236
Personality                  0.000000
dtype: float64


### **train data에서 id 제거**

In [4]:
train_data = train_data.iloc[:,1:]
print("columns:",train_data.columns.tolist())

columns: ['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance', 'Going_outside', 'Drained_after_socializing', 'Friends_circle_size', 'Post_frequency', 'Personality']


# **KNN Classifier 모델 구축**

## **train data 분할하기**

In [5]:
from sklearn.model_selection import train_test_split

#categorical variables one-hot encoding
train_data = pd.get_dummies(train_data, drop_first=True)
print("columns: ", train_data.columns.tolist())

#spilt train data as train and validation
X = train_data[['Time_spent_Alone','Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency', 'Stage_fear_Yes', 'Drained_after_socializing_Yes']]
y = train_data["Personality_Introvert"]
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.3, random_state = 42)

columns:  ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency', 'Stage_fear_Yes', 'Drained_after_socializing_Yes', 'Personality_Introvert']


## **파이프라인 구축(하이퍼라미터 튜닝,모델학습)**

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import GridSearchCV

In [7]:
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("imputer",IterativeImputer(random_state=42)),
    ("knn",KNeighborsClassifier())
])

param_grid=[{
    "scaler": [StandardScaler(),None],
    "imputer__estimator": [LinearRegression(), BayesianRidge()],
    "imputer__max_iter":[50, 200],
    "imputer__tol": [1e-3, 5e-3, 1e-2],
    "knn__n_neighbors":[9,11,13,15]
}]

search = GridSearchCV(pipe, param_grid, cv=5, scoring = 'accuracy', n_jobs=-1)
search.fit(X_train, y_train)

print("-"*90)
print("Best parameters found :\n",search.best_params_)
print("Best cross validation accuracy :", search.best_score_)


------------------------------------------------------------------------------------------
Best parameters found :
 {'imputer__estimator': LinearRegression(), 'imputer__max_iter': 50, 'imputer__tol': 0.001, 'knn__n_neighbors': 11, 'scaler': None}
Best cross validation accuracy : 0.9693815060475076


##  **검증데이터로 정확도 테스트**

In [8]:
#최종 파이프라인 확인
final_pipeline = search.best_estimator_
print(final_pipeline)

Pipeline(steps=[('scaler', None),
                ('imputer',
                 IterativeImputer(estimator=LinearRegression(), max_iter=50,
                                  random_state=42)),
                ('knn', KNeighborsClassifier(n_neighbors=11))])


In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


y_pred = final_pipeline.predict(X_val)

print("Accuracy:",accuracy_score(y_val,y_pred))
print("classification report\n",classification_report(y_val,y_pred))
print("Confusion Matrix\n",confusion_matrix(y_val,y_pred))
      

Accuracy: 0.967434328895286
classification report
               precision    recall  f1-score   support

       False       0.98      0.98      0.98      4115
        True       0.94      0.93      0.94      1443

    accuracy                           0.97      5558
   macro avg       0.96      0.96      0.96      5558
weighted avg       0.97      0.97      0.97      5558

Confusion Matrix
 [[4036   79]
 [ 102 1341]]


## **테스트 데이터로 예측**

In [10]:
test_ids = test_data['id']
test_data = test_data.iloc[:,1:]

In [11]:
test_data = pd.get_dummies(test_data,drop_first=True)
print(test_data.columns.tolist())

['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency', 'Stage_fear_Yes', 'Drained_after_socializing_Yes']


In [12]:
final_prediction = final_pipeline.predict(test_data)
pred_map = {1: 'Introvert',0:'Extrovert'}
final_pred_str = [pred_map[p] for p in final_prediction]

# submission file 만들기

In [13]:
submission_df = pd.DataFrame({'id':test_ids,'Personality':final_pred_str})
print(submission_df.head(10))
submission_df.to_csv('submission.csv',index=False)

      id Personality
0  18524   Extrovert
1  18525   Introvert
2  18526   Extrovert
3  18527   Extrovert
4  18528   Introvert
5  18529   Extrovert
6  18530   Extrovert
7  18531   Introvert
8  18532   Extrovert
9  18533   Introvert
