# 20기 KNN 정규세션 과제

**데이터:** [blackfriday | Kaggle](https://www.kaggle.com/llopesolivei/blackfriday)

---

## 0. 데이터 불러오기

In [27]:
import warnings
# Suppress every warning for the rest of the session, including any raised by pandas or scikit-learn.
warnings.filterwarnings("ignore") 

In [28]:
import pandas as pd
# Read the Black Friday CSV, using its first column as the row index.
df = pd.read_csv("blackfriday.csv", index_col = 0)
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1001088,P00046042,F,0-17,10,A,3,0,5,17.0,,2010
1,1004493,P00347742,F,0-17,10,A,1,0,7,,,4483
2,1005302,P00048942,F,0-17,10,A,1,0,1,4.0,,7696
3,1001348,P00145242,F,0-17,10,A,3,0,2,4.0,,16429
4,1001348,P00106742,F,0-17,10,A,3,0,3,5.0,,5780


## 1. Preprocessing

In [29]:
df.isna().sum()   # Per-column NaN count: only Product_Category_2 and Product_Category_3 have missing values

User_ID                          0
Product_ID                       0
Gender                           0
Age                              0
Occupation                       0
City_Category                    0
Stay_In_Current_City_Years       0
Marital_Status                   0
Product_Category_1               0
Product_Category_2            1533
Product_Category_3            3454
Purchase                         0
dtype: int64

In [30]:
# Fill missing Product_Category_2 / Product_Category_3 values with each column's most frequent value.
# NOTE(review): the imputer is fit on the whole frame, before the train/test split done later in this
# notebook, so rows that end up in the test split influence the imputed value. Consider fitting the
# imputer on the training split only.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
# Overwrites both columns of df in place with the imputed values.
df[['Product_Category_2','Product_Category_3']] = imputer.fit_transform(df[['Product_Category_2','Product_Category_3']])

In [31]:
# Re-count missing values per column to confirm the imputation left none behind.
df.isnull().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [32]:
# Show column dtypes and non-null counts after imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4998 entries, 0 to 4997
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     4998 non-null   int64  
 1   Product_ID                  4998 non-null   object 
 2   Gender                      4998 non-null   object 
 3   Age                         4998 non-null   object 
 4   Occupation                  4998 non-null   int64  
 5   City_Category               4998 non-null   object 
 6   Stay_In_Current_City_Years  4998 non-null   object 
 7   Marital_Status              4998 non-null   int64  
 8   Product_Category_1          4998 non-null   int64  
 9   Product_Category_2          4998 non-null   float64
 10  Product_Category_3          4998 non-null   float64
 11  Purchase                    4998 non-null   int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 507.6+ KB


## 2. KNN 구현

In [33]:
# Columns used for the KNN experiment: six numeric columns followed by the Gender column.
knn_columns = [
    'Occupation',
    'Marital_Status',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
    'Purchase',
    'Gender',
]
df_knn = df[knn_columns]
df_knn.head()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Gender
0,10,0,5,17.0,16.0,2010,F
1,10,0,7,8.0,16.0,4483,F
2,10,0,1,4.0,16.0,7696,F
3,10,0,2,4.0,16.0,16429,F
4,10,0,3,5.0,16.0,5780,F


In [34]:
from sklearn.model_selection import train_test_split

# Gender is the label; every other column of df_knn is a feature.
y = df_knn['Gender']
x = df_knn.drop(columns='Gender')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [35]:
from sklearn.model_selection import cross_val_score

def print_metrics(model, X_train, y=None, cv=10):
    """Print the mean cross-validated accuracy of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator handed straight to ``cross_val_score``.
    X_train : array-like
        Feature matrix to cross-validate on.
    y : array-like, optional
        Labels aligned with ``X_train``. When omitted, the notebook-level
        ``y_train`` is used, so existing two-argument calls keep working.
    cv : int, default 10
        Fold specification forwarded to ``cross_val_score``.

    Returns
    -------
    None
        The mean score is printed with seven decimal places.
    """
    # Resolve labels at call time so the global is a fallback rather than a hidden requirement.
    labels = y_train if y is None else y
    scores = cross_val_score(model, X_train, labels, cv=cv)
    print('\n*** KNN Accuracy *** \n   {:.7f}'.format(scores.mean()))

In [36]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: KNN with scikit-learn's default hyperparameters on the unscaled features.
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
# Report the mean cross-validated accuracy on the training split.
print_metrics(knn, x_train)


*** KNN Accuracy *** 
   0.6978379


## 3. 파라미터 튜닝

In [37]:
# Hyperparameter grid shared by the grid searches in this notebook.
# 'mahalanobis' is deliberately not listed: scikit-learn's Mahalanobis distance needs a covariance
# matrix passed through metric_params (V or VI). Without it every such candidate fails to fit and is
# scored NaN, which goes unnoticed here because warnings are globally suppressed.
grid_params = {
    'n_neighbors': list(range(1, 20)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

In [38]:
from sklearn.model_selection import GridSearchCV

# 10-fold grid search over grid_params on the unscaled training features.
gs = GridSearchCV(knn, grid_params, cv=10)
gs.fit(x_train, y_train)
print("Best Parameters : ", gs.best_params_)
# Best mean cross-validation score found on the training split.
print("Best Score : ", gs.best_score_)
# Score of the selected configuration on the held-out test split.
print("Best Test Score : ", gs.score(x_test, y_test))

Best Parameters :  {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}
Best Score :  0.7447138763814982
Best Test Score :  0.7473333333333333


## 4. 성능을 더 높이기 위해 Scaling

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [40]:
# Standard Scaler
# Fit the scaler on the training split only, then apply the same transform to the test split.
ss = StandardScaler()
train_standardized = ss.fit_transform(x_train)
test_standardized = ss.transform(x_test)

# Wrap the scaled arrays back into DataFrames so the column names are kept.
x_train_s = pd.DataFrame(train_standardized, columns=x_train.columns)
x_test_s = pd.DataFrame(test_standardized, columns=x_test.columns)
x_train_s.head()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,-1.246645,1.18426,-0.087529,1.100304,0.36109,-0.101807
1,0.312107,-0.844409,-0.087529,-0.292697,0.36109,-0.425756
2,-0.155519,-0.844409,-0.087529,1.100304,0.36109,-0.79175
3,-1.246645,1.18426,-1.104726,1.564637,0.74017,0.570318
4,-1.09077,1.18426,-0.087529,-0.292697,0.36109,-0.800159


In [41]:
# Minmax Scaler
# Fit the scaler on the training split only, then apply the same transform to the test split.
ms = MinMaxScaler()
train_minmax = ms.fit_transform(x_train)
test_minmax = ms.transform(x_test)

# Wrap the scaled arrays back into DataFrames so the column names are kept.
x_train_m = pd.DataFrame(train_minmax, columns=x_train.columns)
x_test_m = pd.DataFrame(test_minmax, columns=x_test.columns)
x_train_m.head()

Unnamed: 0,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,0.0,1.0,0.210526,0.75,0.866667,0.366124
1,0.5,0.0,0.210526,0.375,0.866667,0.298423
2,0.35,0.0,0.210526,0.75,0.866667,0.221934
3,0.0,1.0,0.0,0.875,0.933333,0.50659
4,0.05,1.0,0.210526,0.375,0.866667,0.220177


In [42]:
# 10-fold grid search over grid_params on the standardized training features.
# NOTE(review): x_train_s was scaled using statistics from the whole training split, so each CV
# validation fold has already influenced the scaling. Wrapping scaler + KNN in a Pipeline would
# avoid this; confirm whether the effect matters here.
knn_s = KNeighborsClassifier()
gs_s = GridSearchCV(knn_s, grid_params, cv=10)
gs_s.fit(x_train_s, y_train)
print("Best Parameters : ", gs_s.best_params_)
print("Best Score : ", gs_s.best_score_)
# Score of the selected configuration on the standardized held-out test split.
print("Best Test Score : ", gs_s.score(x_test_s, y_test))

Best Parameters :  {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}
Best Score :  0.7449938600081867
Best Test Score :  0.7493333333333333


In [43]:
# 10-fold grid search over grid_params on the min-max scaled training features.
# NOTE(review): x_train_m was scaled using statistics from the whole training split, so each CV
# validation fold has already influenced the scaling. Wrapping scaler + KNN in a Pipeline would
# avoid this; confirm whether the effect matters here.
knn_m = KNeighborsClassifier()
gs_m = GridSearchCV(knn_m, grid_params, cv=10)
gs_m.fit(x_train_m, y_train)
print("Best Parameters : ", gs_m.best_params_)
print("Best Score : ", gs_m.best_score_)
# Score of the selected configuration on the min-max scaled held-out test split.
print("Best Test Score : ", gs_m.score(x_test_m, y_test))

Best Parameters :  {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}
Best Score :  0.7424167007777324
Best Test Score :  0.7493333333333333


##### 교차 검증 점수(Best Score)가 가장 높은 Standard Scaled Data의 {'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'} parameters를 선택 (Test Score는 MinMax Scaled Data와 동일)

In [44]:
# Final Model
# KNN with manhattan distance, 19 neighbors and uniform weights, fit on the standardized training data.
knn_s = KNeighborsClassifier(metric= 'manhattan', n_neighbors= 19, weights= 'uniform')
knn_s.fit(x_train_s, y_train)
# Report the mean cross-validated accuracy on the standardized training split.
print_metrics(knn_s, x_train_s)


*** KNN Accuracy *** 
   0.7449939


## 5. Evaluation

In [45]:
from sklearn.metrics import confusion_matrix

def print_test_metrics(model, X_test, y=None):
    """Print the accuracy and confusion matrix of a fitted ``model``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``score(X, y)`` and ``predict(X)``.
    X_test : array-like
        Feature matrix to evaluate on.
    y : array-like, optional
        True labels aligned with ``X_test``. When omitted, the notebook-level
        ``y_test`` is used, so existing two-argument calls keep working.

    Returns
    -------
    None
        Both metrics are printed.
    """
    # Resolve labels at call time so the global is a fallback rather than a hidden requirement.
    labels = y_test if y is None else y
    print('*** Test Accuracy *** \n   {}'.format(model.score(X_test, labels)))
    print('\n*** Confusion Matrix *** \n', confusion_matrix(labels, model.predict(X_test)))

In [46]:
# Evaluate the final model on the standardized held-out test split.
print_test_metrics(knn_s, x_test_s)

*** Test Accuracy *** 
   0.7493333333333333

*** Confusion Matrix *** 
 [[  23  341]
 [  35 1101]]
