In [1]:
# Report the installed versions of the libraries this notebook relies on.
# importlib.metadata (standard library) replaces the deprecated pkg_resources API.
from importlib import metadata

ls_libs = ["numpy", "pandas", "scipy", "scikit-learn"]
for n_lib in ls_libs:
    try:
        vak_ver = metadata.version(n_lib)
    except metadata.PackageNotFoundError:
        # Keep reporting the remaining libraries instead of aborting the loop.
        vak_ver = "not installed"
    print(f"{n_lib}: {vak_ver}")

numpy: 2.2.3
pandas: 2.2.3
scipy: 1.15.2
scikit-learn: 1.6.1


In [2]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor

## Set 1

In [3]:
# Load the Set 1 CSV and show its first record to see the column layout.
df = pd.read_csv(filepath_or_buffer = "set_01_data.csv")
df.head(n = 1)

Unnamed: 0,Id,Stars,Ratings,Reviews,current_price,MRP,channel,Operating_system,Picture_quality,Speaker,Frequency
0,TCL_016,4.3,0,0,39990,89990,Netflix|Prime Video|Disney+Hotstar|Youtube,Operating System: Android,Ultra HD (4K) 3840 x 2160 Pixels,30 W Speaker Output,60 Hz Refresh Rate


### Q1.

In [4]:
# Work on an independent copy of the three columns examined in Q1.
q1_cols = ["Frequency", "Picture_quality", "Speaker"]
df_q1 = df.loc[:, q1_cols].copy()
df_q1.head(n = 2)

Unnamed: 0,Frequency,Picture_quality,Speaker
0,60 Hz Refresh Rate,Ultra HD (4K) 3840 x 2160 Pixels,30 W Speaker Output
1,1 Year Warranty on Product,60 Hz Refresh Rate,2 x HDMI | 2 x USB


In [5]:
# Look at the distinct values of one column at a time and keep those that
# match a 2-3 digit number followed by " Hz".  Swap "Speaker" for "Frequency"
# or "Picture_quality" to run the same check on the other columns.
ser_u = df_q1.loc[:, "Speaker"].drop_duplicates()
is_hz_value = ser_u.str.contains("[0-9]{2,3} Hz")
ser_u[is_hz_value]

4       50 Hz Refresh Rate
33      60 Hz Refresh Rate
38     200 Hz Refresh Rate
39     120 Hz Refresh Rate
353    100 Hz Refresh Rate
Name: Speaker, dtype: object

In [6]:
# Toy example: str.contains returns a boolean mask, one entry per element.
toy_values = ["a", "b", "b"]
ser = pd.Series(data = toy_values)
ser.str.contains("a")

0     True
1    False
2    False
dtype: bool

In [7]:
# Cast the boolean match mask to integers explicitly with astype.
is_a = ser.str.contains("a")
is_a.astype("int")

0    1
1    0
2    0
dtype: int64

In [8]:
# Same cast done implicitly: adding 0 turns the boolean mask into integers.
is_a = ser.str.contains("a")
is_a + 0

0    1
1    0
2    0
dtype: int64

In [9]:
# Flag (1/0) whether each candidate column holds a "Hz" value in a given row;
# adding 0 casts the boolean mask to integers.
hz_flag_sources = {
    "hz_ck1": "Frequency",
    "hz_ck2": "Picture_quality",
    "hz_ck3": "Speaker",
}
for flag_col, src_col in hz_flag_sources.items():
    df_q1[flag_col] = df_q1[src_col].str.contains("Hz") + 0

In [10]:
# Per-row total of the three "Hz" flags, then the distribution of that total.
df_q1["hz_ck_sum"] = df_q1["hz_ck1"].add(df_q1["hz_ck2"]).add(df_q1["hz_ck3"])
df_q1["hz_ck_sum"].value_counts()

hz_ck_sum
1    659
0      7
Name: count, dtype: int64

In [11]:
# Keep only the rows whose flag total is non-zero, then report how many remain.
has_hz = df_q1["hz_ck_sum"].ne(0)
df_q1_sub = df_q1.loc[has_hz]
df_q1_sub.shape[0]

659

In [12]:
# Preview the first five rows of the filtered subset.
df_q1_sub.head(n = 5)

Unnamed: 0,Frequency,Picture_quality,Speaker,hz_ck1,hz_ck2,hz_ck3,hz_ck_sum
0,60 Hz Refresh Rate,Ultra HD (4K) 3840 x 2160 Pixels,30 W Speaker Output,1,0,0,1
1,1 Year Warranty on Product,60 Hz Refresh Rate,2 x HDMI | 2 x USB,0,1,0,1
2,60 Hz Refresh Rate,Full HD 1920 x 1080 Pixels,20 W Speaker Output,1,0,0,1
3,60 Hz Refresh Rate,Ultra HD (4K) 3840 x 2160 Pixels,50 W Speaker Output,1,0,0,1
4,3 x HDMI | 2 x USB,20 W Speaker Output,50 Hz Refresh Rate,0,0,1,1


In [13]:
def _n_cells_with_60hz(row):
    """Count how many cells of one row contain the text "60 Hz"."""
    return row.str.contains("60 Hz").sum()


# Apply row by row over the columns up to and including "Speaker".
spec_cols = df_q1_sub.loc[:, :"Speaker"]
ser_cnt = spec_cols.apply(_n_cells_with_60hz, axis = 1)
ser_cnt.value_counts() # answer: 493 rows contain "60 Hz"

1    493
0    166
Name: count, dtype: int64

### Q2.

In [14]:
# Re-read the CSV so Q2 works from a freshly loaded frame.
df = pd.read_csv(filepath_or_buffer = "set_01_data.csv")
df.head(n = 1)

Unnamed: 0,Id,Stars,Ratings,Reviews,current_price,MRP,channel,Operating_system,Picture_quality,Speaker,Frequency
0,TCL_016,4.3,0,0,39990,89990,Netflix|Prime Video|Disney+Hotstar|Youtube,Operating System: Android,Ultra HD (4K) 3840 x 2160 Pixels,30 W Speaker Output,60 Hz Refresh Rate


In [15]:
# ck_0 is the product Stars * Ratings * Reviews, so it is zero as soon as any
# one of the three is zero.
df["ck_0"] = df["Stars"].mul(df["Ratings"]).mul(df["Reviews"])


def _n_cells_with_4k(row):
    """Count how many cells of one row contain the text "4K"."""
    return row.str.contains("4K").sum()


# ck_4k counts matches across the columns "channel" through "Picture_quality".
df["ck_4k"] = df.loc[:, "channel":"Picture_quality"].apply(_n_cells_with_4k, axis = 1)

In [16]:
# Keep rows where both checks are non-zero, then report how many remain.
nonzero_product = df["ck_0"].ne(0)
has_4k = df["ck_4k"].ne(0)
df_q2 = df.loc[nonzero_product & has_4k]
df_q2.shape[0]

113

In [17]:
# Clustering inputs: the five columns from "Stars" through "MRP", listed
# explicitly in their CSV order.
df_q2_model = df_q2[["Stars", "Ratings", "Reviews", "current_price", "MRP"]]

In [18]:
# Preview the first two rows of the clustering input.
df_q2_model.head(n = 2)

Unnamed: 0,Stars,Ratings,Reviews,current_price,MRP
3,4.4,9687,1532,62999,79999
10,4.5,17462,5306,54999,69999


In [19]:
# Three-cluster k-means with a fixed random_state, fitted on the Q2 frame.
model_kmeans = KMeans(
    random_state = 123,
    n_clusters = 3,
)
model_kmeans.fit(X = df_q2_model)

In [20]:
# Count the members of each fitted cluster.
cluster_ids = model_kmeans.labels_
ser_labels = pd.Series(data = cluster_ids)
ser_labels.value_counts() # answer: the largest cluster has 55 members

2    55
0    44
1    14
Name: count, dtype: int64

### Q3.

In [25]:
# Re-read the CSV so Q3 works from a freshly loaded frame.
df = pd.read_csv(filepath_or_buffer = "set_01_data.csv")

In [None]:
# Drop rows whose "channel" value contains "Pixel" or "Oper", then renumber
# the index from 0.
has_pixel_or_oper = df["channel"].str.contains("Pixel|Oper")
df_q3 = df.loc[~has_pixel_or_oper].reset_index(drop = True)
df_q3.head(n = 2)

In [27]:
# Feature engineering for the Q3 model.
#
# x1: review-writing ratio (Reviews / Ratings).
# A zero denominator makes pandas produce NaN (0 / 0) or +/-inf (n / 0).
# dropna() removes NaN but not inf, and an inf left in the features would break
# model fitting, so inf is mapped to NaN for both ratio features.
df_q3["x1"] = (df_q3["Reviews"] / df_q3["Ratings"]).replace([np.inf, -np.inf], np.nan)
# x2: MRP, used as-is.
df_q3["x2"] = df_q3["MRP"]
# x3: current price relative to MRP.
df_q3["x3"] = (df_q3["current_price"] / df_q3["MRP"]).replace([np.inf, -np.inf], np.nan)
# x4-x6: 1/0 indicators; adding 0 casts the boolean mask to integers.
df_q3["x4"] = df_q3["channel"].str.contains("Netflix") + 0
df_q3["x5"] = df_q3["channel"].str.contains("Prime Video") + 0
# "4K|8K" is a regex alternation: 1 when Picture_quality mentions 4K or 8K.
df_q3["x6"] = df_q3["Picture_quality"].str.contains("4K|8K") + 0

In [28]:
# Modelling frame: "Stars" plus the engineered columns x1..x6, as a copy.
model_cols = ["Stars", "x1", "x2", "x3", "x4", "x5", "x6"]
df_model = df_q3.loc[:, model_cols].copy()
df_model.head(n = 2)

Unnamed: 0,Stars,x1,x2,x3,x4,x5,x6
0,4.3,,89990,0.444383,1,1,1
1,0.0,,45900,0.997495,1,0,0


In [None]:
# Missing-value count per column.
df_model.isnull().sum(axis = 0)

In [30]:
# Discard every row that has a missing value in any column.
df_model = df_model.dropna(axis = 0, how = "any")
df_model.head(n = 2)

Unnamed: 0,Stars,x1,x2,x3,x4,x5,x6
2,4.4,0.15815,79999,0.787497,1,1,1
6,4.5,0.30386,69999,0.785711,1,1,1


In [None]:
# Re-check the per-column missing-value count.
df_model.isnull().sum(axis = 0)

In [32]:
# Number of rows in the modelling frame.
df_model.shape[0]

200

In [None]:
# Fit a random forest (fixed random_state) with "Stars" as the target and
# every other df_model column as a feature.
features = df_model.drop(columns = "Stars")  # df_model without the "Stars" column
target = df_model["Stars"]
model_rf = RandomForestRegressor(random_state = 123)
model_rf.fit(X = features, y = target)