In [1]:
# constant
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
np_EXPORT1 = "../data/processed/05_w2v_train.npy"
np_EXPORT2 = "../data/processed/05_w2v_test.npy"

In [2]:
import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import gensim
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

from sklearn.linear_model import Lasso 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer , ColumnTransformer

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline


# metric
from sklearn.metrics import mean_squared_error,make_scorer
scoring = make_scorer(mean_squared_error) 


import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

from autocorrect import Speller
spell = Speller()

from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import re


### Functions
- preprocess using gensim

In [3]:
def _step_one(X):
     X = X.apply(lambda x: gensim.utils.simple_preprocess(x))
     return X
first_pre = FunctionTransformer(_step_one)

### Reading the DataFrame

In [4]:
#read the dataFrame
# Read Data 
df = pd.read_pickle(DF_PATH)
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,US,"Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
1,US,"Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opu...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling (Lake Michigan Shore),Riesling,St. Julian
2,US,"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rus...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child Block Pinot Noir (Willamette Valley),Pinot Noir,Sweet Cheeks
3,US,"Soft, supple plum envelopes an oaky structure in this Cabernet, supported by 15% Merlot. Coffee ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Cabernet Sauvignon (Napa Valley),Cabernet Sauvignon,Kirkland Signature
4,US,"Slightly reduced, this wine offers a chalky, tannic backbone to an otherwise juicy explosion of ...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alexander Valley),Cabernet Sauvignon,Louis M. Martini


In [5]:
X_train, X_test, y_train, y_test = train_test_split (df['description'], df['points'].values , test_size=0.2)

### how to represent sentence by words vectors ?
- for a sentence, first represent a word by a vector, then averaging them to represent the sentence, apply this for all sentences.

In [6]:

class preprocess_w2v(BaseEstimator, TransformerMixin):
    def fit(self, X, y = None):
        self.model = gensim.models.Word2Vec(X,
                                   vector_size=150,
                                   window=5,
                                   min_count=1)
        self.words =  set(self.model.wv.index_to_key)
        return self
    def transform(self, X):
        X_vecs = np.array([np.array([self.model.wv[i] for i in ls if i in self.words]) for ls in X], dtype=object)
        X = np.array([vs.mean(axis = 0) if vs.size else np.zeros(100, dtype=float) for vs in X_vecs])
        return np.array(X)

In [7]:
model = make_pipeline(first_pre,preprocess_w2v(),RandomForestRegressor())
model.fit(X_train , y_train)
print('model in mean_squared_error train set ',mean_squared_error(y_train,model.predict(X_train)))
print('model in mean_squared_error test set ',mean_squared_error(y_test,model.predict(X_test)))

model in mean_squared_error train set  0.6923331565095205
model in mean_squared_error test set  4.886177020632404


### Tuning
- take a sample from the vectors of w2v model that fitted in the all data, for a tuning purpose

In [8]:
# for tuning purpose
# split w2v and the regressor, to avoid many same fit and transform

data_transform = make_pipeline(first_pre,preprocess_w2v())
train = data_transform.fit_transform(X_train)
new_data = np.hstack((train,y_train.reshape(-1,1)))

In [9]:
data =  new_data[np.random.choice(new_data.shape[0], 6000, replace=False)]
X_tr, X_te, y_tr, y_te = train_test_split (data[:,0:150], data[:,150] , test_size=0.4)

In [10]:
X_tr.shape, y_tr.shape

((3600, 150), (3600,))

In [11]:
# base pipline
pipe = make_pipeline(RandomForestRegressor())

param_grid = [{'randomforestregressor': [RandomForestRegressor()],
               'randomforestregressor__ccp_alpha': np.linspace(0.0, 0.0090, 20)
            
              }]

grid = GridSearchCV(pipe, param_grid ,n_jobs=-2 , scoring = scoring , cv = 3)
#with config_context(target_offload="gpu:0"):
grid.fit(X_tr,y_tr)
print(' mean_squared_error on train set = ', grid.score(X_tr, y_tr))
print(' mean_squared_error on test set = ', grid.score(X_te, y_te))

 mean_squared_error on train set =  0.9031624461298846
 mean_squared_error on test set =  5.480527274674163


In [12]:
results = pd.DataFrame(grid.cv_results_)

In [13]:
params = results.sort_values(by = ['mean_test_score']).reset_index().loc[0,'params']
params

{'randomforestregressor': RandomForestRegressor(ccp_alpha=0.0014210526315789472),
 'randomforestregressor__ccp_alpha': 0.0028421052631578945}

In [14]:
grid.best_params_

{'randomforestregressor': RandomForestRegressor(ccp_alpha=0.0014210526315789472),
 'randomforestregressor__ccp_alpha': 0.0014210526315789472}

the best of grid is not the best in real .!, my be the reason is the metric

### fit again in the all data

In [15]:

model.set_params(**params)
model.fit(X_train , y_train)
print('model in mean_squared_error train set ',mean_squared_error(y_train,model.predict(X_train)))
print('model in mean_squared_error test set ',mean_squared_error(y_test,model.predict(X_test)))

model in mean_squared_error train set  3.6019428124834825
model in mean_squared_error test set  5.168201198367874


#### need some deep tuning 

### save the vectors to use in deffernt models

In [16]:
data_transform = make_pipeline(first_pre,preprocess_w2v())
w2v_train = data_transform.fit_transform(X_train)

In [17]:
# train
print(w2v_train.shape)
w2v_train = np.hstack((w2v_train,y_train.reshape(-1,1)))
w2v_train.shape

(40365, 150)


(40365, 151)

In [18]:
# test
w2v_test = np.hstack((data_transform.transform(X_test),y_test.reshape(-1,1)))
w2v_test.shape

(10092, 151)

In [19]:
np.save(np_EXPORT1,w2v_train)
np.save(np_EXPORT2,w2v_test)