In [39]:
import altair as alt
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [40]:
df = pd.read_csv("cleaned_data.csv")

In [41]:
df = df.drop(columns = 'Unnamed: 0')

In [42]:
df.head()

Unnamed: 0,customer_id,accepts_marketing,ordered_month,location,gender,free_shipping,product_type,skin_type,fv_site,order_url,buy
0,510336833,True,4,"ONTARIO, CANADA",unknown,True,Anti-Aging,Normal to Dry,auto,auto,False
1,544818881,False,6,"BRITISH COLUMBIA, CANADA",unknown,False,Other,Normal to Oily,,,True
2,577330433,False,5,"OREGON, UNITED STATES",female,False,Other,Normal to Dry,,,True
3,584222401,False,5,"NEW JERSEY, UNITED STATES",unknown,False,Other,Normal to Oily,,,False
4,593077569,False,5,"ONTARIO, CANADA",unknown,False,Other,Normal to Dry,,,False


In [43]:
df.isna().any()

customer_id          False
accepts_marketing    False
ordered_month        False
location              True
gender               False
free_shipping        False
product_type         False
skin_type            False
fv_site               True
order_url             True
buy                  False
dtype: bool

In [53]:
# replace NaNs with unknown
df['location'] = df['location'].replace({np.nan: "unknown"})
df['order_url'] = df['order_url'].replace({np.nan:"unknown"})

In [55]:
df['location'].unique()

array(['ONTARIO, CANADA', 'BRITISH COLUMBIA, CANADA',
       'OREGON, UNITED STATES', 'NEW JERSEY, UNITED STATES',
       'QUEENSLAND, AUSTRALIA', 'QUEBEC, CANADA', 'ALBERTA, CANADA',
       'unknown', 'ILLINOIS, UNITED STATES', 'NEW YORK, UNITED STATES',
       'TEXAS, UNITED STATES', 'CALIFORNIA, UNITED STATES',
       'RIO GRANDE DO SUL, BRAZIL', 'SASKATCHEWAN, CANADA',
       'MICHIGAN, UNITED STATES', 'HAWAII, UNITED STATES',
       'NEWFOUNDLAND, CANADA', 'OHIO, UNITED STATES',
       'INDIANA, UNITED STATES', 'WASHINGTON, UNITED STATES',
       'COLORADO, UNITED STATES', 'NORTH CAROLINA, UNITED STATES',
       'VICTORIA, AUSTRALIA', 'NEW BRUNSWICK, CANADA',
       'SOUTH AUSTRALIA, AUSTRALIA', 'SARAWAK, MALAYSIA',
       'MANITOBA, CANADA', 'ARIZONA, UNITED STATES',
       'GEORGIA, UNITED STATES', 'LISBOA, PORTUGAL',
       'WISCONSIN, UNITED STATES', 'PUERTO RICO, UNITED STATES',
       'MASSACHUSETTS, UNITED STATES', 'FLORIDA, UNITED STATES',
       'NUNAVUT, CANADA', 'WEST V

In [45]:
# split the data into features and target

categorical_features = ['accepts_marketing', 'ordered_month', 'gender', 'free_shipping',
                       'product_type', 'skin_type', 'location', 'order_url']
target = df[['buy']]

In [46]:
# preprocess your data: encode categorical features

from sklearn.preprocessing import OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first'), categorical_features)])


In [47]:
X = df.drop(columns='buy')
y = df[['buy']]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=123)


In [48]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=123)

In [49]:
lin_reg_model = GridSearchCV(LinearRegression(),
                        param_grid = {'fit_intercept': [True, False]},
                        cv = 5,
                        scoring = 'neg_mean_squared_error')

In [59]:
rgr = make_pipeline(preprocessor,
                        lin_reg_model)

In [51]:
rgr.fit(X_train, y_train.to_numpy().ravel())

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ohe',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['accepts_marketing',
                                                   'ordered_month', 'gender',
                                                   'free_shipping',
                                                   'product

In [57]:
predicted = rgr.predict(X_valid)
print(f"lin_reg best hyperparams = {lin_reg_model.best_params_}.")
print(f"lin_reg RMSE = {np.sqrt(mean_squared_error(predicted, y_valid['buy'])):.2f}")

ValueError: Found unknown categories ['AUCKLAND, NEW ZEALAND', 'LEÓN, SPAIN', 'MAHARASHTRA, INDIA', 'NAPOLI, ITALY', 'SARAWAK, MALAYSIA', 'CANTERBURY, NEW ZEALAND', 'TREVISO, ITALY', 'RIO GRANDE DO SUL, BRAZIL', 'NEW TERRITORIES, HONG KONG', 'BARCELONA, SPAIN', 'SUFFOLK, UNITED KINGDOM', 'CIUDAD DE MÉXICO, MEXICO', 'BAHIA, BRAZIL', 'SANTA CATARINA, BRAZIL', 'GRANADA, SPAIN', 'MADRID, SPAIN', 'SOUTH AUSTRALIA, AUSTRALIA'] in column 6 during transform

In [66]:
X_train[X_train['location'] == 'AUCKLAND, NEW ZEALAND']

Unnamed: 0,customer_id,accepts_marketing,ordered_month,location,gender,free_shipping,product_type,skin_type,fv_site,order_url


In [67]:
X_valid[X_valid['location'] == 'AUCKLAND, NEW ZEALAND']

Unnamed: 0,customer_id,accepts_marketing,ordered_month,location,gender,free_shipping,product_type,skin_type,fv_site,order_url
278695,945017323567,True,3,"AUCKLAND, NEW ZEALAND",female,False,Anti-Aging,Normal to Dry,,unknown
22387,6515800705,False,9,"AUCKLAND, NEW ZEALAND",andy,False,Redness,Combination,,unknown


In [None]:
predicted[3]

In [None]:
list(y_valid['buy'].astype(int))[3]