In [1]:
# Relative path of the pickled DataFrame that this notebook loads with pd.read_pickle.
DF_PATH = "../data/processed/1_preprocessed_df.pkl"

### Libraries

In [2]:


import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import gensim
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

from sklearn.linear_model import Lasso 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer , ColumnTransformer

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error,r2_score



import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

from autocorrect import Speller
spell = Speller()

from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import re


### Functions

In [3]:
# w2v preprocessing
def _step_one(X):
    """Tokenize every document in ``X`` with ``gensim.utils.simple_preprocess``.

    ``X`` is expected to support ``.apply`` (e.g. a pandas Series of strings);
    the result holds one list of tokens per input element.
    """
    tokenized = X.apply(gensim.utils.simple_preprocess)
    return tokenized


# Wrap the tokenizer so it can be used as a scikit-learn pipeline step.
first_pre = FunctionTransformer(_step_one)

In [4]:
# bow preprocessing
TEXT_CLEANING_RE = "[^A-Za-z0-9]"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

# Frozen snapshot of `stop_words` for O(1) membership tests. `stop_words` itself
# stays a list; rebuild this set if that list is modified afterwards.
_STOP_WORD_SET = frozenset(stop_words)
# Memo of token -> stemmer.stem(spell(token)). Spell-correction is by far the most
# expensive step, and the same tokens recur across documents.
_STEM_CACHE = {}


# for every row
def preprocess(text, stem=True):
    """Normalize a single document for the bag-of-words branch.

    Steps: cast to ``str`` and lower-case, replace every character that is not
    an ASCII letter or digit with a space, drop English stop words, and, when
    ``stem`` is true, spell-correct and stem each remaining token.

    Parameters
    ----------
    text : object
        The raw document; it is converted with ``str(text)``.
    stem : bool, default True
        If true, each kept token becomes ``stemmer.stem(spell(token))``;
        otherwise kept tokens are emitted unchanged.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # Replace everything except ASCII letters and digits with a space
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token in _STOP_WORD_SET:
            continue
        if stem:
            stemmed = _STEM_CACHE.get(token)
            if stemmed is None:
                stemmed = _STEM_CACHE[token] = stemmer.stem(spell(token))
            tokens.append(stemmed)
        else:
            tokens.append(token)
    return " ".join(tokens)


def _pre_all(data, stem=True):
    """Apply ``preprocess`` to every element of ``data``.

    Parameters
    ----------
    data : object with ``.apply`` (e.g. a pandas Series of documents)
    stem : bool, default True
        Forwarded to ``preprocess``; previously this argument was accepted but
        ignored, so stemming could not be switched off.

    Returns
    -------
    The result of ``data.apply``: one cleaned string per input element.
    """
    return data.apply(lambda text: preprocess(text, stem=stem))


# Pipeline-compatible wrapper around `_pre_all` (uses the default stem=True).
pre_all = FunctionTransformer(_pre_all)





#### Best hyperparameters from the last tuning run

In [5]:
# Both dicts are applied with Pipeline.set_params: a bare step name replaces that
# step, and a `step__param` key sets a parameter on the (replaced) step.
# The vectorizer is created with defaults because 'vectorizer__ngram_range' below
# is what actually decides the n-gram range; a different value passed to the
# constructor here would be silently overridden.
bow_param = {
 'vectorizer': CountVectorizer(),
 'vectorizer__analyzer': 'word',
 'vectorizer__ngram_range': (1, 1),
 'vec2': TfidfTransformer()}
# random_state is fixed so the forest (and the reported metrics) are reproducible.
regressor_param = {'regressor': RandomForestRegressor(random_state=42),
 'regressor__ccp_alpha': 0.0}

### Read the DataFrame

In [6]:
# Load the DataFrame stored at DF_PATH and preview its first rows.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so only load files from a trusted source.
df = pd.read_pickle(DF_PATH)
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,US,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
1,US,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opu...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore),Riesling,St. Julian
2,US,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rus...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley),Pinot Noir,Sweet Cheeks
3,US,"Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Cabernet Sauvignon (Napa Valley),Cabernet Sauvignon,Kirkland Signature
4,US,"Slightly reduced, this wine offers a chalky, tannic backbone to an otherwise juicy explosion of ...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alexander Valley),Cabernet Sauvignon,Louis M. Martini


In [7]:
# 80/20 split of description -> points. random_state is fixed so the split, and
# therefore the train/test errors reported below, are reproducible on Run All.
X_train, X_test, y_train, y_test = train_test_split(
    df['description'], df['points'], test_size=0.2, random_state=42
)

### How to represent a sentence by its word vectors?

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
class preprocess_w2v(BaseEstimator, TransformerMixin):
    """Represent each tokenized document by the mean of its Word2Vec vectors.

    ``fit`` trains a ``gensim.models.Word2Vec`` model on the given documents;
    ``transform`` averages the vectors of the in-vocabulary tokens of each
    document. A document with no in-vocabulary token maps to a zero vector of
    the model's dimensionality.

    Parameters
    ----------
    vector_size : int, default 300
        Dimensionality of the word vectors (passed to ``Word2Vec``).
    window : int, default 5
        Context window size (passed to ``Word2Vec``).
    min_count : int, default 1
        Minimum token frequency (passed to ``Word2Vec``).
    """

    def __init__(self, vector_size=300, window=5, min_count=1):
        # Stored unmodified so get_params / set_params / clone work as usual.
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count

    def fit(self, X, y=None):
        """Train the Word2Vec model on ``X`` (an iterable of token lists)."""
        self.model = gensim.models.Word2Vec(X,
                                            vector_size=self.vector_size,
                                            window=self.window,
                                            min_count=self.min_count)
        # Set of known tokens for fast membership tests in transform().
        self.words = set(self.model.wv.index_to_key)
        return self

    def transform(self, X):
        """Return an array of shape (n_documents, vector_size) of mean vectors."""
        wv = self.model.wv
        # Take the width from the trained model so the zero-vector fallback always
        # matches the real vectors (a hard-coded width here produced ragged rows).
        dim = wv.vector_size
        doc_vectors = []
        for tokens in X:
            known = [wv[tok] for tok in tokens if tok in self.words]
            if known:
                doc_vectors.append(np.mean(known, axis=0))
            else:
                # Empty document or no token seen during fit.
                doc_vectors.append(np.zeros(dim, dtype=wv.vectors.dtype))
        # reshape keeps a 2-D (0, dim) result even when X is empty.
        return np.asarray(doc_vectors).reshape(-1, dim)

### BOW + W2V


> Use the best BOW and regressor hyperparameters from the last tuning run.

In [9]:
# Bag-of-words branch: clean the text, count tokens, then re-weight with TF-IDF.
# The tuned parameters in `bow_param` are applied right after construction
# (set_params returns the pipeline itself).
pre_bow = Pipeline(steps=[
    ('preprocess', pre_all),
    ('vectorizer', CountVectorizer()),
    ('vec2', TfidfTransformer()),
]).set_params(**bow_param)

# Word2Vec branch: tokenize with gensim, then average the word vectors per document.
pre_w2v = make_pipeline(first_pre, preprocess_w2v())

In [10]:
# Transformer that combines the two text representations (BOW/TF-IDF and W2V)
# into a single feature matrix.
from sklearn.base import clone


class preprocess_comp(BaseEstimator, TransformerMixin):
    """Concatenate the bag-of-words features and the Word2Vec features.

    ``fit`` fits private clones of the module-level ``pre_bow`` and ``pre_w2v``
    pipelines, so the templates stay unfitted and separate instances (for
    example the per-fold copies made during cross-validation) do not overwrite
    each other's fitted state. The fitted copies are stored on ``self.bow``
    and ``self.w2v``.
    """

    def fit(self, X, y=None):
        """Fit both branches on the raw documents ``X``."""
        self.bow = clone(pre_bow).fit(X, y)
        self.w2v = clone(pre_w2v).fit(X, y)
        return self

    def transform(self, X):
        """Return the dense horizontal stack ``[bow features | w2v features]``."""
        # The sparse BOW matrix is densified so it can be stacked with the dense
        # W2V array; this can be memory-hungry for large vocabularies.
        bow_features = self.bow.transform(X).toarray()
        w2v_features = self.w2v.transform(X)
        return np.hstack((bow_features, w2v_features))

In [11]:
# Build the pipeline with a Lasso placeholder as the base regressor;
# `regressor_param` then swaps in the tuned regressor and its settings.
model = Pipeline(steps=[
    ('pre', preprocess_comp()),
    ('regressor', Lasso()),
])
model.set_params(**regressor_param)
model.fit(X_train, y_train)

In [12]:
# Mean squared error of the fitted model on the training and held-out sets.
train_mse = mean_squared_error(y_train, model.predict(X_train))
test_mse = mean_squared_error(y_test, model.predict(X_test))
print('w2v_base_model in mean_squared_error train set ', train_mse)
print('w2v_base_model in mean_squared_error test set ', test_mse)

w2v_base_model in mean_squared_error train set  0.6311162577294546
w2v_base_model in mean_squared_error test set  4.465975624470855


### The gap between train and test MSE (≈0.63 vs ≈4.47) indicates overfitting; we can tune `ccp_alpha` or other random-forest hyperparameters to reduce it