In [12]:
# constant
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
DF_EXPORT = "../data/processed/04_bow_cv_results_df.pkl"

In [2]:

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import re

# preprocess
from sklearn.compose import make_column_transformer , ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso , LassoCV , Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor



from sklearn.model_selection import cross_val_score


# metric
from sklearn.metrics import make_scorer,mean_squared_error
scoring = make_scorer(mean_squared_error) 

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
from autocorrect import Speller
spell = Speller()
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin


In [3]:
# Read Data 
df = pd.read_pickle(DF_PATH)
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
1,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
2,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
3,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
4,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini


 ## use the text features
* a text-based model using a bag-of-words approach
> only work in the description feature as text feature, may be use more of text features in future.
#### BOW

##### Sample to test and tuning

In [5]:
sample = df.sample(6000) # small sample to avoid much time of computing
X_train,X_test,y_train,y_test = train_test_split(sample['description'] , sample['points'] ,  random_state=42)

### preprocessing:
   - clean text
   - drop stop words
   - correct the spelling
   - root of the word
- convert the function to column transformer ( to play with Pipelines )
   

In [6]:
TEXT_CLEANING_RE = "[^A-Za-z0-9]"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

# for every row
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(spell(token)))
            else:
                tokens.append(token)
    return " ".join(tokens)


# for the DataFrame
def _pre_all(data,stem = True):
    X = data.apply(lambda x : preprocess(x))
  
    return X
    
pre_all = FunctionTransformer(_pre_all)

In [21]:
# test a model with the default hyperparametrs
model = make_pipeline(pre_all,CountVectorizer(lowercase=False,analyzer = 'word'),RandomForestRegressor())
score = cross_val_score(model, X_train,y_train, scoring = scoring)
score

array([5.78784589, 5.54471167, 5.65396678, 5.88259122, 5.64428967])

cross val score to test the sample, it is stable and we can do tuning with it or not 

In [8]:
model.fit(X_train,y_train)

In [10]:
print(mean_squared_error(y_test,model.predict(X_test)))

5.5946744


> looks good what about if we tuning 

### Hyperparameter tuning
### BOW + tuning

In [7]:
# base pipline
pipe = Pipeline([
    ('preprocess',pre_all),
    ('vectorizer',CountVectorizer()),
    ('vec2' , TfidfTransformer() ),
     ('regressor', Lasso())
])


param_grid = [{'regressor': [RandomForestRegressor()],
               'regressor__ccp_alpha': np.linspace(0.0, 0.0030, 5),
              # 'regressor__max_iter' : [500,600],
               'vectorizer': [CountVectorizer()],
               'vec2' : [TfidfTransformer() , 'passthrough'],
              'vectorizer__analyzer' : ['word'],
              'vectorizer__ngram_range' : [(1, 1),(2, 2)]
              }]

grid = GridSearchCV(pipe, param_grid ,n_jobs=-2 , scoring = scoring , cv = 3)
#with config_context(target_offload="gpu:0"):
grid.fit(X_train,y_train)
print(' mean_squared_error on train set = ', grid.score(X_train, y_train))
print(' mean_squared_error on test set = ', grid.score(X_test, y_test))

 mean_squared_error on train set =  0.9039793777777777
 mean_squared_error on test set =  11.815014933333334


In [9]:
print(grid.best_params_)

{'regressor': RandomForestRegressor(), 'regressor__ccp_alpha': 0.0, 'vec2': TfidfTransformer(), 'vectorizer': CountVectorizer(ngram_range=(2, 2)), 'vectorizer__analyzer': 'word', 'vectorizer__ngram_range': (2, 2)}


In [10]:
results = pd.DataFrame(grid.cv_results_)


In [31]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,param_regressor__ccp_alpha,param_vec2,param_vectorizer,param_vectorizer__analyzer,param_vectorizer__ngram_range,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,247.378961,45.57337,0.451127,0.1979589,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0,TfidfTransformer(),CountVectorizer(),word,"(1, 1)",{'regressor': (DecisionTreeRegressor(max_featu...,5.558518,5.217778,5.473449,5.416581,0.144802,20
1,1963.892849,328.014804,0.378986,1.94668e-07,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0,TfidfTransformer(),CountVectorizer(),word,"(2, 2)",{'regressor': (DecisionTreeRegressor(max_featu...,11.865637,12.348694,12.145046,12.119793,0.198014,1
2,181.895121,37.384278,0.452124,0.2099526,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0,passthrough,CountVectorizer(),word,"(1, 1)",{'regressor': (DecisionTreeRegressor(max_featu...,5.993299,5.509117,5.799277,5.767231,0.198961,13
3,2021.725155,210.820318,0.441818,0.007463274,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0,passthrough,CountVectorizer(),word,"(2, 2)",{'regressor': (DecisionTreeRegressor(max_featu...,8.186887,7.769238,7.644559,7.866895,0.231923,9
4,177.987573,8.485146,0.74667,0.01741431,"(DecisionTreeRegressor(max_features=1.0, rando...",0.00075,TfidfTransformer(),CountVectorizer(),word,"(1, 1)",{'regressor': (DecisionTreeRegressor(max_featu...,5.647237,5.253186,5.452269,5.450897,0.160874,16
5,1748.008976,177.718038,0.381646,0.007909153,"(DecisionTreeRegressor(max_features=1.0, rando...",0.00075,TfidfTransformer(),CountVectorizer(),word,"(2, 2)",{'regressor': (DecisionTreeRegressor(max_featu...,11.98553,12.212752,11.985463,12.061248,0.107129,2
6,193.961179,52.196299,0.438162,0.1947161,"(DecisionTreeRegressor(max_features=1.0, rando...",0.00075,passthrough,CountVectorizer(),word,"(1, 1)",{'regressor': (DecisionTreeRegressor(max_featu...,5.977331,5.493467,5.841197,5.770665,0.203735,12
7,1677.723979,188.273462,0.397271,0.00409892,"(DecisionTreeRegressor(max_features=1.0, rando...",0.00075,passthrough,CountVectorizer(),word,"(2, 2)",{'regressor': (DecisionTreeRegressor(max_featu...,8.252215,7.771755,7.602735,7.875568,0.275123,7
8,177.651805,38.312734,0.531246,0.1591701,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0015,TfidfTransformer(),CountVectorizer(),word,"(1, 1)",{'regressor': (DecisionTreeRegressor(max_featu...,5.609613,5.193651,5.49645,5.433238,0.1756,19
9,1237.050721,53.881521,0.362032,0.007099025,"(DecisionTreeRegressor(max_features=1.0, rando...",0.0015,TfidfTransformer(),CountVectorizer(),word,"(2, 2)",{'regressor': (DecisionTreeRegressor(max_featu...,11.924819,12.30768,11.88218,12.038226,0.191326,3


In [32]:
params = results.sort_values(by=['mean_test_score'],ascending=True).reset_index().loc[0,'params']
params

{'regressor': RandomForestRegressor(),
 'regressor__ccp_alpha': 0.0,
 'vec2': TfidfTransformer(),
 'vectorizer': CountVectorizer(),
 'vectorizer__analyzer': 'word',
 'vectorizer__ngram_range': (1, 1)}

### fit it in the all dataset

In [22]:
X_train,X_test,y_train,y_test = train_test_split(df['description'] , df['points'] ,  random_state=42)

In [26]:
pipe = Pipeline([
    ('preprocess',pre_all),
    ('vectorizer',CountVectorizer()),
    ('vec2' , TfidfTransformer() ),
     ('regressor', Lasso())
])

pipe.set_params(**params)

pipe.fit(X_train, y_train)

In [27]:
print('model in mean_squared_error train set ',mean_squared_error(y_train,pipe.predict(X_train)))
print('model in mean_squared_error test set ',mean_squared_error(y_test,pipe.predict(X_test)))

model in mean_squared_error train set  0.6385758360713257
model in mean_squared_error test set  4.39661680533536


> overfitting
> ccp_alpha will solve the problem ^^, we may go deeply in the tuning but the best model is not the only goal from the project
> tuning is vary cost process, may we use another techniques in the future like random search 

### test a liner model
- the hyperparameter i get it from a past tuning, avoid tuning again it is a cost process

In [28]:
params_linear = {'regressor': Ridge(),
 'regressor__alpha': 0.0001,
 'vec2': 'passthrough',
 'vectorizer': CountVectorizer(ngram_range=(2, 2)),
 'vectorizer__analyzer': 'word',
 'vectorizer__ngram_range': (1, 1)}

In [29]:
pipe_linear = Pipeline([
    ('preprocess',pre_all),
    ('vectorizer',CountVectorizer()),
    ('vec2' , TfidfTransformer() ),
     ('regressor', Lasso())
])

pipe_linear.set_params(**params_linear)
pipe_linear.fit(X_train, y_train)

In [30]:
print('model in mean_squared_error train set ',mean_squared_error(y_train,pipe_linear.predict(X_train)))
print('model in mean_squared_error test set ',mean_squared_error(y_test,pipe_linear.predict(X_test)))

model in mean_squared_error train set  1.396153243241578
model in mean_squared_error test set  5.089420585579888


> near to nonlinear