# Regression

In [8]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix

In [9]:
filename = "cleaned_dataset.csv"

df = pd.read_csv(filename)

print(df)
print(df.dtypes)

      is_clean                                       product_name  \
0            1                        Hydrating Lip Treatment Oil   
1            1                  Skin-Enhancing Tinted Moisturizer   
2            1   Lash-Amplifying Volumizing & Lengthening Mascara   
3            1           Skin Melt Talc-Free Loose Setting Powder   
4            1                    Light-Catching Highlighter Balm   
...        ...                                                ...   
1189         0                  Lash Clash Extreme Volume Mascara   
1190         0                     The Bold High Pigment Lipstick   
1191         0     NU LIP & CHEEK BALMY TINT with Hyaluronic Acid   
1192         0  NU GLOW IN BALM Face Priming Moisturizer with ...   
1193         0  NU BLOTTING LOTION Pore Minimizer & Mattifying...   

              brand_name  size  loves_count  number_of_reviews  rating  \
0               Ami Colé  0.15        22871                397  4.3401   
1               Ami Col

In [10]:
print(df['is_clean'].value_counts())

0    1001
1     193
Name: is_clean, dtype: int64


In [11]:
# separate classes
df_majority = df[df['is_clean'] == 0]
df_minority = df[df['is_clean'] == 1]

# upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True, 
                                 n_samples=len(df_majority), 
                                 random_state=42)

# combine majority class with upsampled minority class
data_upsampled = pd.concat([df_majority, df_minority_upsampled])

print(data_upsampled['is_clean'].value_counts())

0    1001
1    1001
Name: is_clean, dtype: int64


In [12]:
# separate features and target
numerical_features = ['size', 'loves_count', 'number_of_reviews', 'rating', 'list_price']
print(data_upsampled[numerical_features].dtypes)
X = data_upsampled[numerical_features]
y = data_upsampled['is_clean']

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
clf = LogisticRegression(random_state=42)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

size                 float64
loves_count            int64
number_of_reviews      int64
rating               float64
list_price           float64
dtype: object
Accuracy: 0.5910224438902744
Confusion Matrix:
 [[ 87 115]
 [ 49 150]]

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.43      0.51       202
           1       0.57      0.75      0.65       199

    accuracy                           0.59       401
   macro avg       0.60      0.59      0.58       401
weighted avg       0.60      0.59      0.58       401

