In [2]:
import pandas as pd
import pandas_gbq
import pickle
import random
import isbnlib
from sklearn.utils import resample
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

## Get data


In [2]:
# Define some helpers
# Resample
def downsample(df, n):
    m_grouped = df.groupby(['target'])
    l_m_grouped = list(m_grouped)
    clean_majority_list = []
    for i in range(len(l_m_grouped)):
        clean_majority_list.append(l_m_grouped[i][1])
    clean_classes = []
    for i in clean_majority_list:
        clean_df = resample(i, replace = True, n_samples = n, random_state=0)
        clean_classes.append(clean_df)
    clean_data = pd.concat(clean_classes)
    return clean_data

# Mask the data to avoid overfitting
def mask_values(data, frac, columns): 
    sample = data.sample(frac=frac, random_state=0)
    data.at[sample.index.values, columns] = ""
    return data

In [3]:
# Read in the data
data = pd.read_pickle('/Users/victoria/notebooks/data-science/product_categorization/development/v3/data/pcc1_v3_data_2018_02_01.pkl')
# Make target variables
data['target'] = data.cat_1
data = data.drop('cat_1', axis=1)

#Mask
data = mask_values(data, 0.64, 'cat_2') 
data = mask_values(data, 0.65, 'cat_3')
data = mask_values(data, 0.53, 'descr') 
data = mask_values(data, 0.59, 'brand')
data = mask_values(data, 0.74, 'cat_4')
data = mask_values(data, 0.86, 'cat_5')
data = mask_values(data, 0.96, 'cat_6')

# Split the data for testing
data, data_test = train_test_split(data, test_size = 0.2, random_state =11, stratify = data.target)

# Resample
data = downsample(data, 30000)

## Make model pipeline

In [5]:
class CleanStrings(TransformerMixin):
    '''Takes a column and replaces NoneType with blank strings'''
    def transform(self, df, **transform_params):
        for col in df.columns:
            df.loc[:, col] = df.loc[:, col].fillna('')
        return df

    def fit(self, df, y=None, **fit_params):
        return self


class Cats2Words (TransformerMixin):
    '''Takes a list of categories, transforms them into words with underscores, and returns a single series'''
    def transform(self, df, **transform_params):
        df['hold_data'] = ''
        for col in df.columns:
            if 'cat' in col:
                df.loc[:, str(col) +'as_word'] = df.loc[:, col].fillna('').map(
                    lambda x: x.replace("&","").replace(",",""))
                df.loc[:, 'hold_data'] = df.loc[:, ['hold_data', str(col) +'as_word']].apply(' '.join, axis = 1) 
        return df

    def fit(self, df, y=None, **fit_params):
        return self
    
class MerchId(TransformerMixin):
    '''Takes a merch_id column and returns new features.'''    
    def transform(self, df, **transform_params):
        df['merch_id_0'] = df.loc[:, 'merch_id'].apply(lambda x: 'id_0_' + x[0])
        df['merch_id_len'] = df.loc[:, 'merch_id'].apply(lambda x: 'id_len_' + str(len(x)))
        df['isbn'] = df.loc[:, 'merch_id'].apply(lambda x: 'isbn_' + str(isbnlib.is_isbn10(x) or isbnlib.is_isbn13(x)))
        return df
    
    def fit(self, df, y=None, **fit_params):
        return self

class ColumnSelector(TransformerMixin):
    '''Takes a dataframe and joins columns called by key.'''
    def __init__(self, key):
        self.key = key
    
    def fit(self, df, y=None):
        return self
    
    def transform(self, df):
        df['key'] = df[self.key].apply(' '.join, axis = 1)
        return df['key']


In [6]:
tfidf_vectorizer = TfidfVectorizer(input='content', encoding='utf-8',
                                   decode_error='strict', strip_accents=ascii, 
                                   lowercase=True, preprocessor=None, 
                                   tokenizer=None, analyzer='word', 
                                   stop_words='english',
                                   ngram_range=(1, 2), max_df=1.0, min_df=1, 
                                   max_features=None, vocabulary=None, 
                                   binary=False, norm='l2', 
                                   use_idf=True, smooth_idf=True, 
                                   sublinear_tf=False)

sgd = SGDClassifier(alpha=0.0000001, n_jobs=-1)

In [7]:
pipeline = Pipeline([
                ('cleanstrings', CleanStrings()),
                ('cats2words', Cats2Words()),
                ('merchId', MerchId()),
                ('column selector', ColumnSelector(key=['title', 'descr','brand', 'hold_data', 'merch_id_0', 'merch_id_len', 'isbn', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6'])),
                ('tfidf', tfidf_vectorizer),
                ('sgd', sgd)
                ])

In [7]:
pipeline.fit_transform(data, data.target)



<870000x1389115 sparse matrix of type '<class 'numpy.float64'>'
	with 51375995 stored elements in Compressed Sparse Column format>

## Test the model

In [8]:
# Predict on no cats
pipeline = Pipeline([
                ('cleanstrings', CleanStrings()),
                ('cats2words', Cats2Words()),
                ('merchId', MerchId()),
                ('column selector', ColumnSelector(key=['title', 'descr','brand', 'hold_data', 'merch_id_0', 'merch_id_len', 'isbn'])),
                ('tfidf', tfidf_vectorizer),
                ('sgd', sgd)
                ])

In [19]:
predictions = pipeline.predict(data_test)

In [20]:
from sklearn.metrics import classification_report
print(classification_report(data_test.target, predictions, target_names=data_test.target.unique().sort()))

                           precision    recall  f1-score   support

             Alexa Skills       0.62      0.38      0.47        34
               Appliances       0.77      0.91      0.84       527
             Apps & Games       0.65      0.63      0.64       114
    Arts, Crafts & Sewing       0.84      0.92      0.88      4376
               Automotive       0.88      0.94      0.91      5272
            Baby Products       0.85      0.94      0.90      2834
   Beauty & Personal Care       0.93      0.96      0.94      9880
                    Books       0.98      0.95      0.96     13858
              CDs & Vinyl       0.44      0.78      0.57       899
Cell Phones & Accessories       0.90      0.96      0.93      7118
Clothing, Shoes & Jewelry       0.96      0.95      0.96     32480
  Collectibles & Fine Art       0.86      0.67      0.75        48
              Electronics       0.96      0.92      0.94     24328
               Gift Cards       0.95      0.98      0.97     

## Check out best features from test

In [14]:
import numpy as np
svc_model = pipeline.named_steps["sgd"]
tf_idf_model = pipeline.named_steps["tfidf"]
categories = data.target.unique()
categories = np.sort(categories)
i = 0
for c in range(len(categories)):
    print(categories[i])
    coefficients = pd.DataFrame({"names":tf_idf_model.get_feature_names(),
                             "coef":svc_model.coef_[i]})
    #print(coefficients.sort_values("coef", ascending=False).head(10))
    i = i + 1

## Get predictions

In [None]:
random = pd.read_csv('~/Downloads/data_labeled.csv')

In [16]:
predictions = pipeline.predict(random)

In [17]:
print(classification_report(random.target, predictions, target_names=random.target.unique().sort()))

                           precision    recall  f1-score   support

    Arts, Crafts & Sewing       0.60      1.00      0.75         3
               Automotive       0.00      0.00      0.00         0
            Baby Products       0.67      0.67      0.67         3
   Beauty & Personal Care       1.00      0.71      0.83        14
                    Books       0.00      0.00      0.00         0
Clothing, Shoes & Jewelry       0.99      0.95      0.97       419
              Electronics       0.42      1.00      0.59         5
               Gift Cards       1.00      0.50      0.67         2
   Grocery & Gourmet Food       1.00      1.00      1.00         2
       Health & Household       0.00      0.00      0.00         0
           Home & Kitchen       0.98      0.83      0.90       132
     Patio, Lawn & Garden       0.00      0.00      0.00         1
             Pet Supplies       0.50      1.00      0.67         2
        Sports & Outdoors       0.05      1.00      0.10     

  'recall', 'true', average, warn_for)


## Output predictions

In [26]:
predictions = pd.DataFrame(predictions)
reset = pd.DataFrame(data_test).reset_index(drop=True)
result = pd.concat([reset.product_id, reset.title, reset.target, predictions], axis =1)