In [1]:
# Constant: relative path to the pickled DataFrame that this notebook loads
DF_PATH = "../data/processed/1_preprocessed_df.pkl"

### Libraries

In [2]:
import numpy as np 
import pandas as pd 

# preprocess
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler 
from category_encoders import TargetEncoder

from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso , Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.dummy import DummyRegressor

# Metric shared by every approach: plain mean squared error (lower is better).
# NOTE: make_scorer defaults to greater_is_better=True, so this scorer reports the
# raw, positive MSE; do not hand it to a model-selection search as-is.
from sklearn.metrics import make_scorer, mean_squared_error
scoring = make_scorer(score_func=mean_squared_error)



### read the dataFrame

In [3]:
# Load the preprocessed DataFrame and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(DF_PATH)
df.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
1,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
2,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
3,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
4,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


### Preprocessing

#### *nulls*
* The baseline model uses only the non-text features; nulls may be treated differently depending on the approach and the model.
* It is preferable to handle nulls in the preprocessing step.




In [4]:
# Count the missing values per column; dropping nulls is a good starting point
df.isna().sum()

country                      0
description                  0
designation              16287
points                       0
price                      219
province                     0
region_1                   255
region_2                  3688
taster_name              15991
taster_twitter_handle    18708
title                        0
variety                      0
winery                       0
dtype: int64

In [5]:
# Drop only the rows with a missing price: price is a feature of the non-text model.
# Rebinding (instead of inplace=True) leaves the loaded frame object unmutated and
# keeps the step chain-friendly; re-running the cell gives the same result.
df = df.dropna(subset=['price'])

In [6]:
# Sanity check: no missing prices should remain
df['price'].isna().sum()

0

### outliers

In [7]:
# How can outliers be detected in this situation? Preview the rows first.
df.head(n=5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
1,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
2,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
3,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
4,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


In [8]:
# Shape of the subset of wines rated above 95 points
df.loc[df['points'] > 95].shape

(351, 13)

In [9]:
# Points of the wines priced above 90 (row mask and column selection in one .loc call)
df.loc[df['price'] > 90, ['points']]

Unnamed: 0,points
23,86
69,91
115,92
130,95
132,88
...,...
50321,93
50378,90
50381,90
50432,91


In [10]:
# Drop price outliers: keep wines priced strictly between 8 and 90.
# Both bounds are exclusive, so rows priced exactly 8 or 90 are removed too.
df = df.loc[df['price'].gt(8) & df['price'].lt(90)]

### Create a baseline model for predicting wine quality using only non-text features

* selected features for the model
* the base model
* test dummy regressor

In [11]:
# Non-text columns used as inputs of the baseline model
selected_features = [
    "price",
    "province",
]


In [12]:
# Confirm that the selected feature columns contain no missing values
df[selected_features].isna().sum()

price       0
province    0
dtype: int64

In [13]:
# Split features and target ('points') into train and test parts with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df[selected_features],
    df['points'].values,
    random_state=42,
)

In [14]:
# Preprocessing for the baseline: standardise the numeric price and
# target-encode the categorical province.
preprocess_base = make_column_transformer(
    (StandardScaler(), ['price']),
    (TargetEncoder(), ['province']),
)

# Seed the tree (same seed as the train/test split) so that the scores below are
# reproducible between runs; an unseeded tree may break ties between equally
# good splits differently on each run.
base_model = make_pipeline(preprocess_base, DecisionTreeRegressor(random_state=42))

# `scoring` is plain mean squared error, so lower values are better.
scores = cross_val_score(base_model, X_train, y_train, scoring=scoring)
scores

array([6.87212267, 6.6361748 , 6.69820616, 6.65593597, 6.53763832])

In [15]:
# Train the baseline pipeline on the training split
base_model.fit(X_train, y=y_train)

In [16]:
# Mean squared error of the fitted baseline on the held-out test split
test_predictions = base_model.predict(X_test)
print(mean_squared_error(y_test, test_predictions))

6.787336259435565


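> Note: MSE is expressed in squared points, so its square root (RMSE) is what should be compared with the target range. An MSE near 7 is an RMSE of about 2.6 points on a target range of 20, which is still a sizeable error.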

#### test dummy model

In [17]:
# Reference point: a DummyRegressor behind the same preprocessing,
# cross-validated with the same metric and averaged over the folds.
dummy = make_pipeline(preprocess_base, DummyRegressor())
cross_val_score(dummy, X_train, y_train, scoring=scoring).mean()

9.530000351827677

> The baseline model predicts better than the dummy regressor (MSE ≈ 6.8 vs. ≈ 9.5), so it is a good start.