In [9]:
import pandas as pd
import numpy as np
import re

import collections

import os

from sklearn.model_selection import  train_test_split
from sklearn.pipeline import Pipeline

import pickle

import gzip


In [2]:
os.chdir('../')
from functions.featurization import (saint_words,
                                    stop_words,
                                    location_info_features,
                                    Preprocess_Data,
                                    variety_tokenizer,
                                    individual_text_processing,
                                    text_tokenizer,
                                    Create_Simple_Onehot, 
                                    Create_Multigrams,
                                    Merge_Similar_Columns,
                                    Normalize_Points,
)

[nltk_data] Downloading package punkt to /Users/willtong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_json("data/mod_winemag-data-130k-v2.json")
df = df.replace("[+]", "Plus") 
df = df.replace("[", "") 
df = df.replace("]", "") 
# df[df["designation"].fillna('xxxxx').apply(lambda text: '+' in text or '[' in text or ']' in text)]["designation"]

In [4]:
X_train, X_holdout, y_train, y_holdout = train_test_split(df.drop(["price"], axis = 1), 
                                                          df["price"], test_size = 0.1, train_size=0.9, 
                                                          random_state = 5)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, train_size=0.9, random_state = 5)


In [5]:

winery_threshold_frac = 0.0002
winery_threshold_count = int(winery_threshold_frac*X_train.shape[0]+0.5)
designation_threshold_frac = 0.00010
designation_threshold_count = int(winery_threshold_frac*X_train.shape[0]+0.5)
title_threshold_frac = 0.001
title_threshold_count = int(title_threshold_frac*X_train.shape[0]+0.5)
variety_threshold_frac = 0.001
variety_threshold_count = int(variety_threshold_frac*X_train.shape[0]+0.5)
taster_threshold_frac = 0.00001
taster_threshold_count = int(taster_threshold_frac*X_train.shape[0]+0.5)

featurize_pipeline = Pipeline(
    steps = [
             ('preprocess_data', Preprocess_Data(columns_to_drop = ['taster-twitter-handle', 'description'],
                                                 columns_to_impute_unknown = ['taster-name',
                                                 ])),
             ('norm_points', Normalize_Points(point_feature = 'points', groupby_feature = 'taster-name')),
        
             ("title_tokenizer", Create_Multigrams(tokenizer=text_tokenizer, 
                                                   feature = 'title', 
                                                   threshold_frac = title_threshold_frac)),
             ("variety_tokenizer", Create_Multigrams(tokenizer=variety_tokenizer, 
                                                     feature = 'variety',
                                                     threshold_frac = variety_threshold_frac
                                                    )), 

             ('winery_name_onehot', Create_Simple_Onehot(feature ="winery", 
                                                    threshold_frac = winery_threshold_frac)),
             ('designation_onehot', Create_Simple_Onehot(feature ="designation", 
                                                         threshold_frac = designation_threshold_frac)),
             ('region-1_onehot', Create_Simple_Onehot(feature ="region-1", 
                                                         threshold_frac = designation_threshold_frac)),
        
             ('region-2_onehot', Create_Simple_Onehot(feature ="region-2", 
                                                         threshold_frac = designation_threshold_frac)),
        
             ('province_onehot', Create_Simple_Onehot(feature ="province", 
                                                         threshold_frac = designation_threshold_frac)),
        
             ('country_onehot', Create_Simple_Onehot(feature ="country", 
                                                         threshold_frac = designation_threshold_frac)),
             ('taster-name_onehot', Create_Simple_Onehot(feature ="taster-name", 
                                                         threshold_frac = designation_threshold_frac)),
             ('merge_similar_columns', Merge_Similar_Columns()),
            ]
)
featurize_pipeline.fit(X_train)



Fitting normalized_points for point_features = points, groupby taster-name
Calculating normalized point for each taster based on training data
Creating multigrams for feature = title
feature tokenized = title, output dataframe shape = (97989, 701)
Creating multigrams for feature = variety
feature tokenized = variety, output dataframe shape = (97989, 788)
Creating simple onehot for feature = winery
feature onehotted = winery, output dataframe shape = (97989, 1907)
Creating simple onehot for feature = designation
feature onehotted = designation, output dataframe shape = (97989, 2427)
Creating simple onehot for feature = region-1
feature onehotted = region-1, output dataframe shape = (97989, 3011)
Creating simple onehot for feature = region-2
feature onehotted = region-2, output dataframe shape = (97989, 3026)
Creating simple onehot for feature = province
feature onehotted = province, output dataframe shape = (97989, 3235)
Creating simple onehot for feature = country
feature onehotted = c

In [6]:
X_train_transformed = featurize_pipeline.transform(X_train)

Calculating normalized point for each taster based on training data
feature tokenized = title, output dataframe shape = (97989, 701)
feature tokenized = variety, output dataframe shape = (97989, 788)
feature onehotted = winery, output dataframe shape = (97989, 1907)
feature onehotted = designation, output dataframe shape = (97989, 2427)
feature onehotted = region-1, output dataframe shape = (97989, 3011)
feature onehotted = region-2, output dataframe shape = (97989, 3026)
feature onehotted = province, output dataframe shape = (97989, 3235)
feature onehotted = country, output dataframe shape = (97989, 3266)
feature onehotted = taster-name, output dataframe shape = (97989, 3283)
Merging the following raw columns: {'garnacha', 'red', 'nero', 'verdot', 'new mexico', 'muscat', 'veltliner', 'western australia', 'sec', 'oregon', 'alta', 'chianti', 'blaufränkisch', 'crianza', 'gewürztraminer', 'petite', 'missouri', 'portugal', 'made with organic grapes', 'riesling', 'bianchi', 'victoria', 'vir

In [7]:
X_holdout_transformed = featurize_pipeline.transform(X_holdout)

Calculating normalized point for each taster based on training data
feature tokenized = title, output dataframe shape = (12098, 701)
feature tokenized = variety, output dataframe shape = (12098, 788)
feature onehotted = winery, output dataframe shape = (12098, 1907)
feature onehotted = designation, output dataframe shape = (12098, 2427)
feature onehotted = region-1, output dataframe shape = (12098, 3011)
feature onehotted = region-2, output dataframe shape = (12098, 3026)
feature onehotted = province, output dataframe shape = (12098, 3235)
feature onehotted = country, output dataframe shape = (12098, 3266)
feature onehotted = taster-name, output dataframe shape = (12098, 3283)
Merging the following raw columns: {'garnacha', 'red', 'nero', 'verdot', 'new mexico', 'muscat', 'veltliner', 'western australia', 'sec', 'oregon', 'alta', 'chianti', 'blaufränkisch', 'crianza', 'gewürztraminer', 'petite', 'missouri', 'portugal', 'made with organic grapes', 'riesling', 'bianchi', 'victoria', 'vir

In [8]:
X_val_transformed = featurize_pipeline.transform(X_val)

Calculating normalized point for each taster based on training data
feature tokenized = title, output dataframe shape = (10888, 701)
feature tokenized = variety, output dataframe shape = (10888, 788)
feature onehotted = winery, output dataframe shape = (10888, 1907)
feature onehotted = designation, output dataframe shape = (10888, 2427)
feature onehotted = region-1, output dataframe shape = (10888, 3011)
feature onehotted = region-2, output dataframe shape = (10888, 3026)
feature onehotted = province, output dataframe shape = (10888, 3235)
feature onehotted = country, output dataframe shape = (10888, 3266)
feature onehotted = taster-name, output dataframe shape = (10888, 3283)
Merging the following raw columns: {'garnacha', 'red', 'nero', 'verdot', 'new mexico', 'muscat', 'veltliner', 'western australia', 'sec', 'oregon', 'alta', 'chianti', 'blaufränkisch', 'crianza', 'gewürztraminer', 'petite', 'missouri', 'portugal', 'made with organic grapes', 'riesling', 'bianchi', 'victoria', 'vir

In [10]:
with gzip.open('data/gzipX_train_transformed.pckl', 'wb') as f:
    pickle.dump(X_train_transformed, f)

with gzip.open('data/gzipX_holdout_transformed.pckl', 'wb') as f:
    pickle.dump(X_holdout_transformed, f)

with gzip.open('data/gzipX_val_transformed.pckl', 'wb') as f:
    pickle.dump(X_val_transformed, f)

with gzip.open('data/gzipy_train.pckl', 'wb') as f:
    pickle.dump(y_train, f)

with gzip.open('data/gzipy_val.pckl', 'wb') as f:
    pickle.dump(y_val, f)

with gzip.open('data/gzipy_holdout.pckl', 'wb') as f:
    pickle.dump(y_holdout, f)

In [10]:
import gzip

In [None]:
with gzip.open('data/X_train_transformed.pckl', 'wb') as f:
    pickle.dump(X_train_transformed, f)