In [1]:
% matplotlib inline
import datetime
from matplotlib import pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing, cross_validation, metrics, pipeline, grid_search

In [2]:
sales = pd.read_csv("../../assets/dataset/Iowa_Liquor_sales_sample_10pct.csv")

In [3]:
sales.head()

Unnamed: 0,Date,Store Number,City,Zip Code,County Number,County,Category,Category Name,Vendor Number,Item Number,Item Description,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons)
0,11/04/2015,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,$4.50,$6.75,12,$81.00,9.0,2.38
1,03/02/2016,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,$13.75,$20.63,2,$41.26,1.5,0.4
2,02/11/2016,2106,CEDAR FALLS,50613,7.0,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,$12.59,$18.89,24,$453.36,24.0,6.34
3,02/03/2016,2501,AMES,50010,85.0,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,$9.50,$14.25,6,$85.50,10.5,2.77
4,08/18/2015,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,$7.20,$10.80,12,$129.60,21.0,5.55


In [4]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270955 entries, 0 to 270954
Data columns (total 18 columns):
Date                     270955 non-null object
Store Number             270955 non-null int64
City                     270955 non-null object
Zip Code                 270955 non-null object
County Number            269878 non-null float64
County                   269878 non-null object
Category                 270887 non-null float64
Category Name            270323 non-null object
Vendor Number            270955 non-null int64
Item Number              270955 non-null int64
Item Description         270955 non-null object
Bottle Volume (ml)       270955 non-null int64
State Bottle Cost        270955 non-null object
State Bottle Retail      270955 non-null object
Bottles Sold             270955 non-null int64
Sale (Dollars)           270955 non-null object
Volume Sold (Liters)     270955 non-null float64
Volume Sold (Gallons)    270955 non-null float64
dtypes: float64(4), int64(

In [5]:
sales["Bottles Sold"].describe()

count    270955.000000
mean          9.871285
std          24.040912
min           1.000000
25%           2.000000
50%           6.000000
75%          12.000000
max        2508.000000
Name: Bottles Sold, dtype: float64

In [6]:
sales["Bottle Volume (ml)"].describe()

count    270955.000000
mean        924.830341
std         493.088489
min          50.000000
25%         750.000000
50%         750.000000
75%        1000.000000
max        6000.000000
Name: Bottle Volume (ml), dtype: float64

In [7]:
vol_min = sales["Bottle Volume (ml)"].min()
vol_max = sales["Bottle Volume (ml)"].max()
min_max_bottle = (sales["Bottle Volume (ml)"] - vol_min) / (vol_max - vol_min)

In [8]:
numeric_columns = ["Volume Sold (Liters)", "Bottles Sold", "Bottle Volume (ml)"]


scaler = preprocessing.MinMaxScaler()
scaler = scaler.fit(X=sales[numeric_columns])
X_new = scaler.transform(X=sales[numeric_columns])
X_new

array([[ 0.00354879,  0.00438771,  0.11764706],
       [ 0.00055824,  0.00039888,  0.11764706],
       [ 0.00952989,  0.00917431,  0.15966387],
       ..., 
       [ 0.00175446,  0.00199442,  0.11764706],
       [ 0.0041469 ,  0.00199442,  0.28571429],
       [ 0.00059811,  0.00279218,  0.02521008]])

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class Our_MinMaxScaler(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.min = X.min()
        self.max = X.max()
        return self
    
    def transform(self, X):
        X_new = (X - self.min)/ (self.max - self.min)
        return X_new
    
    

In [10]:
scaler = Our_MinMaxScaler()
scaler = scaler.fit(X=sales[numeric_columns])
X_new = scaler.transform(X=sales[numeric_columns])
X_new.head()

Unnamed: 0,Volume Sold (Liters),Bottles Sold,Bottle Volume (ml)
0,0.003549,0.004388,0.117647
1,0.000558,0.000399,0.117647
2,0.00953,0.009174,0.159664
3,0.004147,0.001994,0.285714
4,0.008334,0.004388,0.285714


In [11]:
estimator = pipeline.Pipeline(steps=[('OurMinMaxScaler', Our_MinMaxScaler()),
                                    ('SuperCoolRegression', linear_model.Ridge())])

# estimator = pipeline.makepipeline(Our_MinMaxScaler(), linear_model.Ridge())

In [12]:
estimator = estimator.fit(X=sales[["Volume Sold (Liters)", "Bottle Volume (ml)"]], 
                          y=sales['Bottles Sold'])

In [13]:
estimator.score(X=sales[["Volume Sold (Liters)", "Bottle Volume (ml)"]], y=sales['Bottles Sold'])

0.80290760852960474

In [14]:
sales.dropna(inplace=True)
X = sales['Item Description']
y = sales['Category Name'].str.contains('WHISKIES')

In [15]:
y.value_counts()

False    198581
True      70677
Name: Category Name, dtype: int64

In [16]:
from sklearn import feature_extraction

In [17]:
cout_vect = feature_extraction.text.CountVectorizer()

In [18]:
X_vect = cout_vect.fit_transform(X)
X_vect

<269258x1595 sparse matrix of type '<type 'numpy.int64'>'
	with 882341 stored elements in Compressed Sparse Row format>

In [19]:
dense = X_vect.todense()

In [20]:
n_zeros = np.sum(dense == 0)
n_zeros, dense.size, n_zeros / float(dense.size)

(428584169, 429466510, 0.99794549521451625)

In [21]:
lm = linear_model.LogisticRegression().fit(X=X_vect, y=y)

In [22]:
lm.score(X=X_vect, y=y)

0.9992200788834501

In [23]:
cross_validation.cross_val_score(linear_model.LogisticRegression(), X=X_vect, y=y, cv=5)

array([ 0.9989787 ,  0.9990901 ,  0.99903437,  0.99907151,  0.99888581])

In [24]:
text_pipeline = pipeline.make_pipeline(feature_extraction.text.CountVectorizer(),
                                       linear_model.LogisticRegression())

In [25]:
cross_validation.cross_val_score(text_pipeline, X=X, y=y, cv=5)

array([ 0.9989787 ,  0.9990901 ,  0.99903437,  0.99907151,  0.99888581])

In [26]:
sales = sales.sort_values(by="Item Description")

In [27]:
sales.dropna(inplace=True)
X = sales['Item Description']
y = sales['Category Name'].str.contains('WHISKIES')

In [28]:
cross_validation.cross_val_score(text_pipeline, X=X, y=y, cv=5)

array([ 0.77811821,  0.87213103,  0.87446844,  0.81589943,  0.90991811])

In [29]:
cv = cross_validation.KFold(len(y), n_folds=5, shuffle=True)
cross_validation.cross_val_score(text_pipeline, X=X, y=y, cv=cv)

array([ 0.9990901 ,  0.9988487 ,  0.99907153,  0.99897866,  0.99881154])

In [33]:
text_pipeline.steps

[('countvectorizer',
  CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
          dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('logisticregression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [36]:
param_grid = {
    'countvectorizer__ngram_range': [(1,1), (1,2), (2,2)],
    'countvectorizer__lowercase': [True, False],
    'logisticregression__fit_intercept': [True, False],
    'logisticregression__penalty': ['l1', 'l2']    
}

gs = grid_search.GridSearchCV(text_pipeline, param_grid=param_grid, cv=3, verbose=1, n_jobs=-1)

In [37]:
gs.fit(X, y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  2.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'countvectorizer__lowercase': [True, False], 'countvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 'logisticregression__penalty': ['l1', 'l2'], 'logisticregression__fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [38]:
gs.best_estimator_

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [39]:
gs.best_params_

{'countvectorizer__lowercase': False,
 'countvectorizer__ngram_range': (1, 2),
 'logisticregression__fit_intercept': False,
 'logisticregression__penalty': 'l2'}

In [40]:
gs.best_score_

0.91066560696432419