In [1]:
from importlib.metadata import PackageNotFoundError, version

# Print the installed version of every library this notebook relies on.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
ls_libs = ["numpy", "pandas", "scipy", "scikit-learn"]
for n_lib in ls_libs:
    try:
        val_ver = version(n_lib)
    except PackageNotFoundError:
        # Report a missing distribution instead of aborting the whole cell.
        val_ver = "not installed"
    print(f"{n_lib}: {val_ver}")

numpy: 2.2.3
pandas: 2.2.3
scipy: 1.15.2
scikit-learn: 1.6.1


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

### Q1. 

In [None]:
# Q1: load the dataset and preview its first record.
df = pd.read_csv(filepath_or_buffer = "set_03_data.csv")
df.head(n = 1)

In [None]:
# Mean and standard deviation of the sales index.
ser_sales = df["sales_idx"]
stat_mean = ser_sales.mean()
stat_std = ser_sales.std()
(stat_mean, stat_std)

In [None]:
# Outlier cut-off: the mean plus two standard deviations.
stat_out = stat_mean + stat_std * 2
stat_out

In [None]:
# Keep only the rows whose sales index exceeds the outlier cut-off.
# .copy() makes df_q1 an independent frame, so the column that is added to it
# afterwards does not trigger pandas' SettingWithCopyWarning.
df_q1 = df.loc[df["sales_idx"] > stat_out].copy()
len(df_q1)

In [None]:
# Composite spec index: each component is rescaled, then the four parts are
# summed in the same left-to-right order (storage, memory, cameras, battery).
part_rom = df_q1["ROM"] / 32
part_ram = df_q1["RAM"] / 2
part_cam = df_q1["n_front_cam"] + df_q1["n_rear_cam"]
part_battery = df_q1["battery_c"] / 1000
df_q1["idx"] = part_rom + part_ram + part_cam + part_battery

In [15]:
# Q1 answer: mean of the composite index, rounded to two decimals.
np.round(df_q1["idx"].mean(), 2)

np.float64(11.01)

### Q2.

In [None]:
# Q2: reload the dataset so this question starts from the untouched data.
df = pd.read_csv(filepath_or_buffer = "set_03_data.csv")
df.head(n = 2)

In [None]:
# Remove the screen_size column before scaling and clustering.
df_q2 = df.drop("screen_size", axis = 1)
df_q2.head(n = 1)

In [None]:
# Standardise every remaining column; fit and transform are called as two steps.
scaler_q2 = StandardScaler()
arr_q2_nor = scaler_q2.fit(df_q2).transform(df_q2)
arr_q2_nor[:1]

In [None]:
# Fit a 4-cluster KMeans on the standardised data with a fixed seed.
model_kmeans = KMeans(n_clusters = 4, random_state = 123).fit(arr_q2_nor)
model_kmeans

In [None]:
# Cluster centroids as a frame, labelled with the original column names.
df_c = pd.DataFrame(data = model_kmeans.cluster_centers_,
                    columns = df_q2.columns)
df_c

In [None]:
# Label of the cluster whose centroid has the largest battery_c value.
ser_battery_c = df_c["battery_c"]
val_c_max = ser_battery_c.idxmax()
val_c_max

In [None]:
# Pairwise Euclidean distances between all centroids.
arr_dist = euclidean_distances(X = df_c)
arr_dist

In [34]:
# Q2 answer: distance from the largest-battery cluster (val_c_max) to its nearest
# other cluster. np.delete removes the zero self-distance, so the row no longer
# has to be hard-coded as cluster 0.
round(np.delete(arr_dist[val_c_max], val_c_max).min(), 2)

np.float64(2.66)

In [None]:
# Centroids of cluster 0 and cluster 3, used for a manual distance check.
ser_0, ser_3 = df_c.iloc[0], df_c.iloc[3]

In [44]:
# Cross-check: Euclidean distance between the two centroids computed by hand.
ser_diff = ser_0 - ser_3
round((ser_diff ** 2).sum() ** 0.5, 2)

np.float64(2.66)

### Q3.

In [45]:
# Q3: reload the dataset so this question starts from the untouched data.
df = pd.read_csv(filepath_or_buffer = "set_03_data.csv")
df.head(n = 2)

Unnamed: 0,screen_size,ROM,RAM,n_rear_cam,n_front_cam,battery_c,ratings,n_ratings,sales_p,discount_p,sales_idx
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


`pd.get_dummies()`는 다른 메서드/함수/클래스와 다르게 "columns" 인자에 단일 값을 할당하는 경우에도 반드시 🌟**리스트**🌟 객체를 사용하여 할당해야 한다. 단순 문자열을 할당할 경우 에러가 난다.

그리고 원핫인코딩을 실시할 때 변수명에 띄어쓰기가 있을 수 있는데 `statsmodels` 라이브러리 기반 모델링을 하면서 formula 를 사용하는 경우 변수명에 띄어쓰기를 제거하지 않은 채로 formula를 작성하면 반드시 에러가 발생함. 그리고 이 이슈는 이전 시험에서 응시자가 어려움을 겪은 사례가 있음.  
※ 다음의 코드 결과에서는 "screen_size_Very Large" 변수명에 띄어쓰기가 포함되어 있음.  
※ formula를 사용하려면 변수명을 다음과 같이 변경해야 함: "screen_size_Very Large" -> "screen_size_Very_Large"

In [50]:
# Exam-version call, kept for reference: pd.get_dummies(df, columns = ["screen_size"])
# Latest-version call: dtype is passed so the dummy columns come out as integers.
df_dum = pd.get_dummies(df, columns = ["screen_size"], dtype = "int")
# Not strictly required: move the target sales_idx to the first column
# (later cells slice column 0 as the target).
ls_cols = ["sales_idx"] + [n_col for n_col in df_dum.columns if n_col != "sales_idx"]
df_dum = df_dum[ls_cols]
df_dum.head(n = 2)

Unnamed: 0,sales_idx,ROM,RAM,n_rear_cam,n_front_cam,battery_c,ratings,n_ratings,sales_p,discount_p,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,127.52,64,2,1,1,1800,4.5,38645,32999,0.17,0,0,0,0,1
1,1.39,64,4,2,1,2815,4.5,244,57149,0.04,0,0,1,0,0


In [52]:
# 80/20 train/test split with a fixed seed; show the resulting row counts.
df_train, df_test = train_test_split(df_dum, train_size = 0.8, random_state = 123)
(df_train.shape[0], df_test.shape[0])

(344, 86)

In [53]:
# Fit the min-max scaler on the training rows only, then apply it to both splits.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
arr_train_nor, arr_test_nor = (model_nor.transform(df_part) for df_part in (df_train, df_test))

In [None]:
# Preview the first scaled training row.
arr_train_nor[:1]

In [None]:
# Scaled training data as a frame with the original column names.
df_train_nor = pd.DataFrame(data = arr_train_nor,
                            columns = df_train.columns)
df_train_nor.head(n = 2)

In [None]:
# Per-column max and min learned by the scaler, stacked into a two-row table.
df_minmax = pd.DataFrame(
    data = np.vstack([model_nor.data_max_, model_nor.data_min_]),
    index = ["max", "min"],
    columns = df_train.columns,
)
df_minmax

In [None]:
# Cross-check: per-column max and min computed directly from the training frame.
df_train.agg(["max", "min"])

In [None]:
# Trial run with the first candidate k before looping over all of them.
ls_k = [3, 5, 7, 9, 11]
k = ls_k[0]

# The target (sales_idx) sits in column 0 of the scaled arrays; the rest are features.
arr_train_x, arr_train_y = arr_train_nor[:, 1:], arr_train_nor[:, 0]
arr_test_x, arr_test_y = arr_test_nor[:, 1:], arr_test_nor[:, 0]

model_knn = KNeighborsRegressor(n_neighbors = k).fit(X = arr_train_x, y = arr_train_y)
pred = model_knn.predict(arr_test_x)
# RMSE: square root of the mean squared error.
mean_squared_error(y_true = arr_test_y, y_pred = pred) ** 0.5

In [66]:
# Fit one KNN regressor per candidate k and record its test RMSE
# (computed on the min-max scaled target).
ls_k = [3, 5, 7, 9, 11]
ls_rmse = []

# The target (sales_idx) sits in column 0 of the scaled arrays; slice once,
# outside the loop, because the slices do not depend on k.
arr_train_x, arr_train_y = arr_train_nor[:, 1:], arr_train_nor[:, 0]
arr_test_x, arr_test_y = arr_test_nor[:, 1:], arr_test_nor[:, 0]

for k in ls_k:
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = arr_train_x, y = arr_train_y)
    pred = model_knn.predict(arr_test_x)
    val_rmse = mean_squared_error(y_true = arr_test_y, y_pred = pred) ** 0.5
    # Append in place instead of rebuilding the list on every pass.
    ls_rmse.append(val_rmse)

In [70]:
# Index the RMSE values by k and take the k with the smallest error.
# (Neither the Series nor the loop that filled ls_rmse is strictly necessary.)
ser_rmse = pd.Series(data = ls_rmse, index = ls_k)
val_k_best = ser_rmse.idxmin()
val_k_best

np.int64(3)

In [71]:
# model_knn is whatever the last loop iteration left behind (the final k in ls_k),
# not the best model, so the best k has to be refitted separately.
model_knn.n_neighbors

11

In [None]:
# Refit a KNN regressor with the best k on the scaled training data
# (features are columns 1 onward, the target is column 0).
model_knn_best = KNeighborsRegressor(n_neighbors = val_k_best).fit(
    X = arr_train_nor[:, 1:],
    y = arr_train_nor[:, 0],
)
model_knn_best

In [None]:
# First sketch of the query row with only two fields filled in;
# df_t1 is rebuilt from a test row in the following cell.
df_t1 = pd.DataFrame({"ROM": [256], "RAM": [6]})
df_t1

In [77]:
# Build the single query row for prediction. The first test row serves only as a
# template, so the frame has the same columns, in the same order, as the data the
# scaler was fitted on. Its sales_idx value is a placeholder: the model is later
# fed columns 1 onward only.
dict_t1 = {
    "ROM": 256,
    "RAM": 6,
    "n_rear_cam": 4,
    "n_front_cam": 1,
    "battery_c": 4000,
    "ratings": 4.3,
    "n_ratings": 25000,
    "sales_p": 85000,
    "discount_p": 0.05,
    # One-hot screen size: every dummy is set explicitly so that none of them
    # is silently inherited from the template row.
    "screen_size_Large": 1,
    "screen_size_Medium": 0,
    "screen_size_Small": 0,
    "screen_size_Very Large": 0,
    "screen_size_Very Small": 0,
}
df_t1 = df_test.head(1).reset_index(drop = True)
# Guard against a misspelt key, which would otherwise add a brand-new column.
assert set(dict_t1) <= set(df_t1.columns), "dict_t1 contains a column that df_test lacks"
for n_col, val_col in dict_t1.items():
    df_t1[n_col] = val_col

In [None]:
df_t1 # Step 6

In [84]:
# Rejected alternative: model_nor.transform(df_t1.drop(columns = "sales_idx"))
# NOTE(review): model_nor was fitted on every column of df_train, sales_idx included,
# so the full-width frame is passed here; presumably the narrower frame would be
# rejected for having a different set of columns — confirm.
arr_t1_nor = model_nor.transform(df_t1) # Step 7
arr_t1_nor

array([[0.0119438 , 0.49206349, 0.45454545, 1.        , 0.        ,
        0.42307692, 0.625     , 0.05308122, 0.51815842, 0.09302326,
        1.        , 0.        , 0.        , 0.        , 0.        ]])

In [86]:
# Step 8: predict from the feature columns only; column 0 is sales_idx and is left out.
arr_t1_x = arr_t1_nor[:, 1:]
pred_t1 = model_knn_best.predict(arr_t1_x)
pred_t1

array([0.00132259])

In [90]:
# Abandoned attempts at undoing the scaling on the prediction alone, kept for reference.
# NOTE(review): model_nor was fitted on all columns of df_train, so inverse_transform
# presumably needs a full-width 2-D row rather than the single predicted value — confirm.
# model_nor.inverse_transform(pred_t1)
# model_nor.inverse_transform([pred_t1])
# model_nor.inverse_transform([pred_t1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# Put the predicted (still scaled) sales_idx into column 0 so that the whole row can
# be inverse-transformed. pred_t1 is a one-element array, so it is indexed to store a
# plain scalar instead of relying on NumPy's deprecated implicit array-to-scalar
# conversion.
arr_t1_nor[0, 0] = pred_t1[0]

In [94]:
# Undo the min-max scaling for the whole row; the sales_idx column now holds the
# prediction on the original scale (about 0.65).
arr_t1_inv = model_nor.inverse_transform(arr_t1_nor)
df_t1_inv = pd.DataFrame(data = arr_t1_inv,
                         columns = df_t1.columns)
df_t1_inv

Unnamed: 0,sales_idx,ROM,RAM,n_rear_cam,n_front_cam,battery_c,ratings,n_ratings,sales_p,discount_p,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,0.653333,256.0,6.0,4.0,1.0,4000.0,4.3,25000.0,85000.0,0.05,1.0,0.0,0.0,0.0,0.0


In [95]:
# For comparison: the unscaled input frame, whose sales_idx is still the value
# carried over from the template test row rather than the prediction.
df_t1

Unnamed: 0,sales_idx,ROM,RAM,n_rear_cam,n_front_cam,battery_c,ratings,n_ratings,sales_p,discount_p,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,5.9,256,6,4,1,4000,4.3,25000,85000,0.05,1,0,0,0,0
