In [1]:
# Machine Learning - model training
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

import pickle
import sys
# NOTE(review): machine-specific absolute path. The notebook only runs on a
# machine where the project lives at this location; anyone else must edit it
# (or install the package) before the import below can succeed.
sys.path.append('/Users/wolfsinem/product-tagging')

# Project helper; called below to build the DataFrame used throughout.
from product_tagging.tags_generator import tokenized_list

In [2]:
# Build the working DataFrame via the project helper; the cells below read its
# 'tags' and 'description' columns.
model_df = tokenized_list()

# Row cap: only the first n rows are kept for training further down.
n = 2000

### Clean data

In [3]:
# Drop every row that has a missing value in any column.
# Rebinding the name (instead of inplace=True) leaves the object returned by
# tokenized_list() untouched and keeps the cell safe to re-run.
model_df = model_df.dropna()

In [4]:
# (rows, columns) remaining after the rows with missing values were dropped.
model_df.shape

(19998, 3)

### Preprocessing

#### MultiLabelBinarizer

In [5]:
# Keep only the first n rows (by position) to bound the size of the problem.
model_df = model_df.iloc[:n]

In [6]:
# Raw labels: the 'tags' column, one collection of tag strings per product.
target_variable = model_df.loc[:, 'tags']

In [7]:
# Encode the per-row tag collections as a binary indicator matrix:
# one column per distinct tag, 1 where the row carries that tag.
mlb = MultiLabelBinarizer()

# Read the raw tags straight from the frame rather than from target_variable.
# The previous form fed target_variable back into itself, so re-running this
# cell would binarize the already-binarized 0/1 matrix and silently corrupt
# both the labels and mlb.classes_.
target_variable = mlb.fit_transform(model_df['tags'])

In [8]:
# Tag vocabulary learned by the binarizer; entry j names column j of the
# indicator matrix produced by mlb.fit_transform.
mlb.classes_

array(['_brn', 'aa', 'aadivasi', ..., 'zoysia', 'zte', 'zunia'],
      dtype=object)

#### TfidfVectorizer

In [9]:
# TF-IDF features over the product descriptions.
vectorizer = TfidfVectorizer(
    strip_accents='unicode',    # normalise accented characters
    analyzer='word',
    ngram_range=(1, 3),         # unigrams, bigrams and trigrams
    stop_words='english',
    token_pattern=r'\w{3,}',    # a token is a run of 3+ word characters
)

# NOTE(review): the vectorizer is fitted on every row *before* the train/test
# split below, so vocabulary and IDF weights also see the held-out rows.
# Consider splitting the raw text first and fitting on the training part only.
descriptions = model_df['description']
independent_variable = vectorizer.fit_transform(descriptions)

# Sanity check: both matrices must have the same number of rows.
print('Independent variable shape: {}'.format(independent_variable.shape))
print('Target variable shape: {}'.format(target_variable.shape))

Independent variable shape: (2000, 78536)
Target variable shape: (2000, 2117)


In [10]:
# Mapping from each learned n-gram to its column index in the TF-IDF matrix.
# NOTE(review): this displays the entire vocabulary (one entry per feature
# column); consider showing only a small sample to keep the output readable.
vectorizer.vocabulary_

{'key': 39424,
 'features': 28772,
 'alisha': 6341,
 'solid': 65201,
 'women': 76658,
 'cycling': 21982,
 'shorts': 62869,
 'cotton': 20417,
 'lycra': 43119,
 'navy': 47450,
 'red': 57841,
 'specifications': 65709,
 'details': 23567,
 'number': 48407,
 'contents': 19939,
 'sales': 60358,
 'package': 50395,
 'pack': 50203,
 'fabric': 27896,
 'type': 71913,
 'general': 32477,
 'pattern': 51494,
 'ideal': 36699,
 'care': 14758,
 'gentle': 32726,
 'machine': 43192,
 'wash': 74699,
 'lukewarm': 43059,
 'water': 74954,
 'bleach': 10124,
 'additional': 5752,
 'style': 68266,
 'code': 17359,
 'altht_3p_21': 6544,
 'box': 11357,
 'key features': 39437,
 'features alisha': 28817,
 'alisha solid': 6342,
 'solid women': 65283,
 'women cycling': 76891,
 'cycling shorts': 21986,
 'shorts cotton': 62883,
 'cotton lycra': 20554,
 'lycra navy': 43133,
 'navy red': 47470,
 'red navy': 57970,
 'navy specifications': 47475,
 'specifications alisha': 65755,
 'shorts shorts': 62909,
 'shorts details': 62888

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    independent_variable,
    target_variable,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Report the shapes of both splits, one line each, with a separator between
# the training and test halves.
print(
    'X train shape: {}'.format(X_train.shape),
    'y train shape: {}'.format(y_train.shape),
    '------------------------------',
    'X test shape: {}'.format(X_test.shape),
    'y test shape: {}'.format(y_test.shape),
    sep='\n',
)

X train shape: (1600, 78536)
y train shape: (1600, 2117)
------------------------------
X test shape: (400, 78536)
y test shape: (400, 2117)


In [13]:
# Base binary classifier, fitted once per tag column by the wrapper below.
# NOTE(review): C=8.385 and tol=1e-1 look tuned; the search that produced them
# is not part of this notebook -- confirm their origin.
base_svc = LinearSVC(
    class_weight='balanced',
    random_state=42,
    tol=1e-1,
    C=8.385,
)

# One-vs-rest wrapper for the multi-label target; n_jobs=-1 uses all cores.
one_vs_rest = OneVsRestClassifier(base_svc, n_jobs=-1)

# Single-step pipeline; the step is registered under the name 'clf'.
Linear_pipeline = Pipeline([
    ('clf', one_vs_rest),
])

In [14]:
# Train the one-vs-rest LinearSVC pipeline on the training split. fit() returns
# the fitted pipeline, which the notebook displays as this cell's output.
Linear_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('clf',
                 OneVsRestClassifier(estimator=LinearSVC(C=8.385,
                                                         class_weight='balanced',
                                                         dual=True,
                                                         fit_intercept=True,
                                                         intercept_scaling=1,
                                                         loss='squared_hinge',
                                                         max_iter=1000,
                                                         multi_class='ovr',
                                                         penalty='l2',
                                                         random_state=42,
                                                         tol=0.1, verbose=0),
                                     n_jobs=-1))],
         verbose=False)

In [15]:
# Predict the tag-indicator matrix for the held-out rows.
prediction = Linear_pipeline.predict(X_test)

# For multi-label targets accuracy_score is *subset* accuracy: a row counts as
# correct only when every one of its tag columns is predicted exactly.
subset_accuracy = accuracy_score(y_test, prediction)
print('Accuracy for LinearSVC is {}'.format(subset_accuracy))

Accuracy for LinearSVC is 0.45


In [16]:
# Sample product description used to try the model by hand; kept as a
# one-element list because the vectorizer expects an iterable of documents.
# NOTE(review): this text closely mirrors the description whose tokens open
# vectorizer.vocabulary_, so it is presumably (near-)dataset text rather than
# an unseen example -- confirm before reading much into the prediction.
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [17]:
# Project the sample text into the fitted TF-IDF feature space (transform only,
# no refit), then predict its binary tag-indicator row.
model_string_1 = vectorizer.transform(string_1)
Linear_pipeline.predict(model_string_1)

array([[0, 0, 0, ..., 0, 0, 0]])

In [18]:
# Predict the indicator row for the sample, then map the active columns back
# to their tag names with the fitted binarizer.
sample_indicator = Linear_pipeline.predict(model_string_1)
predicted_tags = mlb.inverse_transform(sample_indicator)
predicted_tags

[('alisha', 'cycling', 'shorts', 'solid', 'women')]

### Save model to deploy

In [19]:
# Persist the fitted pipeline to the file 'text_classifier' in the current
# working directory.
with open('text_classifier', 'wb') as model_file:
    pickle.dump(Linear_pipeline, model_file)

In [20]:
# Persist the fitted TF-IDF vectorizer to the file 'vect'; it is needed to turn
# new text into the feature columns the classifier was trained on.
# NOTE(review): the MultiLabelBinarizer (mlb) is not saved anywhere in this
# notebook, yet it is what maps predictions back to tag names -- confirm how
# the deployed code obtains it.
with open('vect', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [21]:
# Round-trip check: read the pipeline back from disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('text_classifier', 'rb') as model_file:
    model = pickle.load(model_file)

In [22]:
# Round-trip check: read the vectorizer back from disk.
# pickle.load can execute arbitrary code, so only load files you trust.
with open('vect', 'rb') as vectorizer_file:
    vect = pickle.load(vectorizer_file)

In [23]:
# Predict the held-out split with the unpickled pipeline, to compare against
# the score obtained from the in-memory one.
y_pred2 = model.predict(X_test)

In [24]:
# Subset accuracy of the reloaded pipeline; it should equal the score printed
# for the in-memory pipeline if the pickle round trip preserved the model.
reloaded_accuracy = accuracy_score(y_test, y_pred2)
print(reloaded_accuracy)

0.45


In [25]:
# Same sample description as earlier in the notebook, declared again as raw
# text so it can be pushed through the reloaded artifacts.
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [26]:
# Vectorize with the *reloaded* vectorizer (`vect`) so this check exercises the
# pickled artifact rather than the in-memory `vectorizer` it was saved from.
# This rebinds string_1 from raw text to a dense feature array, so re-run the
# cell that defines string_1 before running this cell a second time.
string_1 = vect.transform(string_1).toarray()

In [27]:
# Predict the binary tag-indicator row for the sample with the unpickled
# pipeline.
label = model.predict(string_1)

In [28]:
# Map the indicator row back to tag names.
# NOTE(review): this uses the in-memory binarizer; only the pipeline and the
# vectorizer were pickled above, so a fresh process would have no `mlb` to
# decode predictions with -- confirm how deployment handles this.
mlb.inverse_transform(label)

[('alisha', 'cycling', 'shorts', 'solid', 'women')]