In [None]:
!pip install scikit-learn 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Read the reviews export from the working directory and display it.
wm_df = pd.read_csv(filepath_or_buffer="fashionnova_reviews.csv")
wm_df

In [None]:
# Turn the review date into an integer column so it can be plotted and used
# as a model feature further down.
# The key is "Review Date", matching the later cells that display, select and
# plot this column; the previous "Review date" spelling did not match them.
# NOTE(review): assumes the CSV stores the date as day-month-year — confirm.
# The dtype guard keeps the cell re-runnable: once the column is numeric it is
# left untouched instead of being parsed as a date string again.
if not pd.api.types.is_numeric_dtype(wm_df["Review Date"]):
    review_dates = pd.to_datetime(wm_df["Review Date"], format="%d-%m-%Y")
    wm_df["Review Date"] = pd.to_numeric(review_dates)

In [None]:
# Inspect the review-date column.
wm_df.loc[:, "Review Date"]

In [None]:
# Keep only the four columns explored in the plots that follow.
ndf = wm_df.loc[
    :,
    ["Review Date", "Review Count", "Rating", "Date Of Experience"],
]
ndf.head()

In [None]:
# Histogram overview of the columns selected into ndf.
ndf.hist()
plt.show()

In [None]:
# Scatter each selected column against "Date Of Experience" on a 2x2 grid.
# The last panel therefore plots "Date Of Experience" against itself.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

panel_columns = ["Review Date", "Review Count", "Rating", "Date Of Experience"]
for panel, column in zip(axs.flat, panel_columns):
    panel.scatter(ndf[column], ndf["Date Of Experience"], color="blue")
    panel.set_xlabel(column)
    panel.set_ylabel("Date Of Experience")

In [None]:
# Histogram with KDE of "Date Of Experience", drawn in the left half of a
# 1x2 layout (the right half is left empty).
plt.figure(figsize=(12, 6))
left_ax = plt.subplot(1, 2, 1)
sns.histplot(ndf["Date Of Experience"], bins=20, kde=True, ax=left_ax)
left_ax.set_title("Distribution of Date Of Experience")

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and target for the linear regression.
# The date feature is "Review Date", the key used by the earlier cells that
# select from wm_df; the previous "Date" key matched none of them.
# NOTE(review): the target "Date Of Experience" must be numeric for the
# regression; no conversion of that column is visible in this notebook — confirm.
X = wm_df[["Review Date", "Review Count", "Rating"]]
y = wm_df["Date Of Experience"]

print(X.shape)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares on the training split (fit returns the
# estimator itself), then predict the held-out rows.
lr_model = LinearRegression().fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the held-out split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
# Mean squared error on the training split, for comparison with the test MSE.
train_predictions = lr_model.predict(X_train)
mean_squared_error(y_train, train_predictions)

In [None]:
# Shapes of the held-out targets and of the predictions made for them.
tuple(part.shape for part in (y_test, y_pred))

In [None]:
# Actual vs. predicted target on the training split.
plt.scatter(x=y_train, y=lr_model.predict(X_train))

In [None]:
# Actual vs. predicted target on the held-out split.
plt.scatter(x=y_test, y=y_pred)


In [None]:
# Report the fitted parameters of the linear regression.
for caption, value in (
    ("Coefficients:", lr_model.coef_),
    ("Intercept:", lr_model.intercept_),
):
    print(caption, value)

In [None]:
# scikit-learn ships no `load_fashionnova_reviews` loader, so the import this
# cell used to perform raises ImportError. The classification frames are built
# from the reviews already loaded into `wm_df` instead.
# NOTE(review): the feature list and the target definition below are
# assumptions derived from the columns and the "Good"/"Bad" rating split used
# elsewhere in this notebook — confirm against the intended analysis.
# NOTE(review): every feature column must be numeric for the models below;
# only "Review Date" is converted in the visible cells — confirm the others.
feature_columns = ["Review Date", "Review Count", "Date Of Experience"]
good_rating_threshold = 4

X_df = wm_df[feature_columns].copy()

# Binary label: 1 when the rating reaches the threshold, 0 otherwise.
y_df = (
    (wm_df["Rating"] >= good_rating_threshold)
    .astype(int)
    .to_frame(name="target")
)

# "Rating" stays in the exploration frame so it can be plotted, but is left
# out of X_df: the target is derived from it, so using it as a feature would
# leak the label into the classifier.
fashionnova_ds = pd.concat([X_df, wm_df[["Rating"]], y_df], axis=1)
fashionnova_ds

In [None]:
# List the column labels of the combined feature/target frame.
fashionnova_ds.columns

In [None]:
# Pairwise correlation of all columns, rendered as a labelled matrix image.
cm = fashionnova_ds.corr()
tick_positions = range(len(cm.columns))

fig, ax = plt.subplots(figsize=(10, 10))
ax.matshow(cm)
# Column names on both axes; x labels are rotated so they do not overlap.
plt.xticks(tick_positions, cm.columns, rotation=90)
plt.yticks(tick_positions, cm.columns)

plt.show()

In [None]:
# Same correlation matrix as a seaborn heatmap with the "mako" palette.
# `cm` must still hold the correlation matrix when this cell runs; a later
# cell rebinds the name to a confusion matrix.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=cm, cmap="mako", ax=ax)
plt.show()

In [None]:
# Class balance: number of rows per target value.
fashionnova_ds.loc[:, "target"].value_counts()

In [None]:
# Distribution of the target column, with a KDE overlay.
sns.histplot(data=fashionnova_ds, x="target", kde=True)

In [None]:
# Distribution of "Review Count" over all rows, with a KDE overlay.
sns.histplot(data=fashionnova_ds, x="Review Count", kde=True)

In [None]:
from matplotlib.patches import Patch

# Overlay the "Review Count" distribution of the two target classes.
ds_true = fashionnova_ds[fashionnova_ds["target"] == 1]
ds_false = fashionnova_ds[fashionnova_ds["target"] == 0]

fig, ax = plt.subplots(figsize=(10, 6))

sns.histplot(ds_true["Review Count"], kde=True, color="red", ax=ax)
sns.histplot(ds_false["Review Count"], kde=True, color="blue", ax=ax)

# The legend previously showed the same placeholder text for both classes.
# Explicit proxy handles tie each entry to the colour of the class it names,
# independent of the order in which matplotlib collects the drawn artists.
fig.legend(
    handles=[
        Patch(color="red", label="target = 1"),
        Patch(color="blue", label="target = 0"),
    ]
)

In [None]:
from matplotlib.patches import Patch

# Per-class distribution of each column on a 2x2 grid:
# red = rows with target == 1, blue = rows with target == 0.
ds_true = fashionnova_ds[fashionnova_ds["target"] == 1]
ds_false = fashionnova_ds[fashionnova_ds["target"] == 0]

fig, axs = plt.subplots(2, 2, figsize=(10, 6))

# "Review Date" replaces the previous "Date" key so the column name agrees
# with the one used for the reviews frame in the earlier cells.
panel_columns = ["Review Date", "Date Of Experience", "Review Count", "Rating"]
for panel, column in zip(axs.flat, panel_columns):
    sns.histplot(ds_true[column], kde=True, color="red", ax=panel)
    sns.histplot(ds_false[column], kde=True, color="blue", ax=panel)

# Rating masks kept under their original names. "Bad" is now strictly below
# 4, so a rating of exactly 4 no longer falls into both groups.
Good = fashionnova_ds["Rating"] >= 4
Bad = fashionnova_ds["Rating"] < 4

# The legend previously received the boolean masks themselves as labels, which
# renders their full Series repr as legend text. Use short strings instead.
# NOTE(review): mapping "Good" to target == 1 follows the original label
# order (red series first) — confirm that this is what the target encodes.
fig.legend(
    handles=[
        Patch(color="red", label="Good (target = 1)"),
        Patch(color="blue", label="Bad (target = 0)"),
    ]
)

fig.tight_layout()

In [None]:
# Pairwise relationships between all columns, coloured by target class.
sns.pairplot(data=fashionnova_ds, hue="target")

In [None]:
# Reproducible 80/20 split for the classifier. This rebinds the
# X_train/X_test/y_train/y_test names used by the regression above.
seed = 42

X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y_df,
    random_state=seed,
    test_size=0.2,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# y_train is a one-column frame; flatten it to the 1-D label vector that
# the estimator is given.
train_labels = y_train.values.ravel()

log_model = LogisticRegression()
log_model.fit(X_train, train_labels)

In [None]:
# Predict labels for the held-out split with the fitted logistic model.
# This rebinds y_pred, which earlier held the linear-regression predictions.
y_pred = log_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Accuracy of the logistic model on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Confusion matrix as an annotated heatmap. This rebinds `cm`, which earlier
# held the correlation matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data=cm, annot=True, ax=ax)
_ = plt.xlabel("Predicted")
_ = plt.ylabel("Actual")

In [None]:
from sklearn.metrics import classification_report

# Text report of the classification metrics for the logistic model.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Same classifier as before, but with the features standardized first.
# fit returns the pipeline itself, so construction and fitting are chained.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logistic", LogisticRegression()),
    ]
).fit(X_train, y_train.values.ravel())

y_pred_scale = pipe.predict(X_test)

# Evaluate the scaled model on the held-out split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_scale)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_scale)

print("Accuracy:", accuracy)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(data=cm, annot=True, ax=ax)
_ = plt.xlabel("Predicted")
_ = plt.ylabel("Actual")

In [None]:
# Report the fitted parameters of the unscaled logistic model.
for caption, value in (
    ("Coefficients:", log_model.coef_),
    ("Intercept:", log_model.intercept_),
):
    print(caption, value)

In [None]:
# Signed coefficient of the unscaled logistic model, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 8))

ax.barh(y=X_df.columns, width=log_model.coef_[0])

In [None]:
import numpy as np

# Rank the features by the absolute value of their logistic coefficient.
# Coefficient magnitudes are only comparable when the features share a scale,
# so they are taken from the StandardScaler pipeline (`pipe`) instead of
# `log_model`, which was fit on the raw feature values.
scaled_coefs = pipe.named_steps["logistic"].coef_[0]

feature_importances = pd.DataFrame(
    {"column": X_df.columns, "coef": np.abs(scaled_coefs)}
).sort_values(by="coef", ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(feature_importances["column"], feature_importances["coef"])
ax.set_xlabel("|coefficient| (standardized features)")
ax.set_title("Logistic regression feature importance")