In [None]:
# Machine Learning - model training
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

import pickle
import os
import sys

# Location of the product-tagging checkout that provides the `product_tagging` package.
# Set the PRODUCT_TAGGING_ROOT environment variable to override it on another machine;
# the default keeps the original author's path so existing setups are unaffected.
PROJECT_ROOT = os.environ.get('PRODUCT_TAGGING_ROOT', '/Users/wolfsinem/product-tagging')

# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from product_tagging.tags_generator import tokenized_list

In [None]:
# Build the working frame via tokenized_list() from product_tagging.tags_generator.
# Later cells read its 'tags' and 'description' columns.
model_df = tokenized_list()

# Maximum number of rows kept for training when the frame is truncated further down.
n = 2000

### Clean data

In [None]:
# Drop every row that contains at least one missing value; model_df is modified in place.
model_df.dropna(inplace=True)

In [None]:
# Display (rows, columns) of the frame at this point in the notebook.
model_df.shape

### Preprocessing

#### MultiLabelBinarizer

In [None]:
# Keep only the first n rows (by position) to bound training time.
model_df = model_df.iloc[:n]

In [None]:
# Raw multi-label target: the 'tags' column.
# presumably each entry is a collection of tag strings — TODO confirm against tokenized_list()
target_variable = model_df['tags']

In [None]:
# Encode the per-row tag collections as a binary indicator matrix (one column per tag).
# Read the tags straight from the frame rather than from `target_variable`: the original
# cell consumed and overwrote the same name, so running it a second time would silently
# fit the binarizer on the 0/1 matrix instead of on the tags.
mlb = MultiLabelBinarizer()
target_variable = mlb.fit_transform(model_df['tags'])

In [None]:
# Tag labels learned by the binarizer, in the column order of the encoded target matrix.
mlb.classes_

#### TfidfVectorizer

In [None]:
# TF-IDF configuration:
#   - accents are stripped with the 'unicode' method,
#   - English stop words are removed,
#   - a token is a run of at least three word characters,
#   - features are word n-grams of length 1 to 3.
tfidf_options = {
    'strip_accents': 'unicode',
    'analyzer': 'word',
    'ngram_range': (1, 3),
    'stop_words': 'english',
    'token_pattern': r'\w{3,}',
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and IDF weights from the descriptions and vectorize them in one pass.
# NOTE(review): this is fitted on every row before the train/test split, so test-set
# statistics leak into the features — consider fitting on the training rows only.
independent_variable = vectorizer.fit_transform(model_df['description'])

print(f'Independent variable shape: {independent_variable.shape}')
print(f'Target variable shape: {target_variable.shape}')

In [None]:
# The vocabulary maps every learned 1- to 3-gram to its feature column index. Displaying
# all of it floods the notebook output, so report its size and show a bounded preview.
print('Vocabulary size: {}'.format(len(vectorizer.vocabulary_)))
dict(list(vectorizer.vocabulary_.items())[:20])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
split_parts = train_test_split(
    independent_variable,
    target_variable,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Sanity check: report the dimensions of the training and test partitions.
print(f'X train shape: {X_train.shape}')
print(f'y train shape: {y_train.shape}')
print('------------------------------')
print(f'X test shape: {X_test.shape}')
print(f'y test shape: {y_test.shape}')

In [None]:
# Base binary learner: a linear SVM with balanced class weights, a fixed seed,
# a loose stopping tolerance and the regularisation strength C used by this notebook.
base_svc = LinearSVC(
    class_weight='balanced',
    random_state=42,
    tol=1e-1,
    C=8.385,
)

# One copy of the base learner per tag column; n_jobs=-1 asks for all available processors.
one_vs_rest = OneVsRestClassifier(base_svc, n_jobs=-1)

# Single-step pipeline: the text is vectorized outside of it, so it only holds the classifier.
Linear_pipeline = Pipeline(steps=[('clf', one_vs_rest)])

In [None]:
# Train the one-vs-rest classifier on the training partition.
Linear_pipeline.fit(X_train, y_train)

In [None]:
# Predict the tag indicator matrix for the held-out rows.
prediction = Linear_pipeline.predict(X_test)

# With multi-label indicator targets accuracy_score is subset accuracy: a row only
# counts as correct when every one of its tags is predicted exactly.
subset_accuracy = accuracy_score(y_test, prediction)
print(f'Accuracy for LinearSVC is {subset_accuracy}')

In [None]:
# Single product description used as a smoke-test input (wrapped in a list because the
# vectorizer expects an iterable of documents).
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [None]:
# Vectorize the sample with the already-fitted vectorizer (transform only, no refit),
# then display the predicted binary tag indicator row.
model_string_1 = vectorizer.transform(string_1)
Linear_pipeline.predict(model_string_1)

In [None]:
# Map the predicted indicator row back to tag labels via the fitted binarizer.
sample_prediction = Linear_pipeline.predict(model_string_1)
predicted_tags = mlb.inverse_transform(sample_prediction)
predicted_tags

### Save model to deploy

In [None]:
# Serialize the trained pipeline to the file 'text_classifier' in the working directory.
with open('text_classifier', 'wb') as model_file:
    pickle.dump(Linear_pipeline, model_file)

In [None]:
# Serialize the fitted TF-IDF vectorizer to the file 'vect'; it has to be shipped
# together with the classifier because the pipeline itself does not vectorize text.
with open('vect', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Read the pickled pipeline back from disk to check the round trip.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('text_classifier', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Read the pickled vectorizer back from disk.
# NOTE: pickle.load can execute arbitrary code — only load files you produced yourself.
with open('vect', 'rb') as vectorizer_file:
    vect = pickle.load(vectorizer_file)

In [None]:
# Predict the held-out rows again, this time with the pipeline reloaded from disk.
y_pred2 = model.predict(X_test)

In [None]:
# Accuracy of the reloaded pipeline on the held-out rows.
print(accuracy_score(y_test, y_pred2)) 

In [None]:
# Same sample description as earlier in the notebook, re-declared as raw text for the
# reloaded-model check.
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [None]:
# Vectorize the sample with `vect`, the vectorizer reloaded from disk, so that this check
# exercises both saved artifacts. The original used the in-memory `vectorizer`, which left
# the pickled one untested.
# NOTE: this overwrites string_1 with a dense feature array, so re-run the cell that
# defines the raw text before running this cell again.
string_1 = vect.transform(string_1).toarray()

In [None]:
# Predict the tag indicator row for the vectorized sample with the reloaded pipeline.
label = model.predict(string_1)

In [None]:
# Convert the predicted indicator row back into tag labels.
# NOTE(review): `mlb` is the in-memory binarizer; it is not pickled anywhere in this
# notebook, so a deployment would need it saved alongside the model and vectorizer.
mlb.inverse_transform(label)