In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost as xg, numpy as np, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


In [2]:
data = pd.read_excel("tdata.xlsx")
data.head()

Unnamed: 0,name,category,key_ingredient
0,Mamaearth Epsom Bath Salt for Relaxation and P...,bath salt,epsom
1,"CoCo Body Butter for Dry Skin, with Coffee & C...",body butter,coffee
2,"CoCo Yogurt, with Coffee and Cocoa for Rich Mo...",body butter,coffee
3,"Ubtan Body Butter, For Dry Skin, With Turmeric...",body butter,tumeric
4,Ubtan Yogurt with Turmeric and Saffron for Dee...,body butter,tumeric


In [3]:
cat_data = data[['name','category']]

### Category Classification

In [4]:
cat_data['category'].value_counts()

face wash      30
shampoo        20
face cream     20
face mask      19
conditioner    12
hair oil       11
face serum     10
hair mask      10
body wash       7
facescrub       6
body lotion     6
hair serum      5
body butter     4
face gel        3
hand cream      3
body scrub      1
bath salt       1
Name: category, dtype: int64

In [5]:
# # dropping data points due to very low counts
# cat_data = cat_data[cat_data.groupby('category').category.transform('count')>5]
# cat_data['category'].value_counts()

In [6]:
x_cat = cat_data['name']
y_cat = cat_data['category']

In [7]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_cat)

# transform the training and validation data using count vectorizer object
x_cat_count =  count_vect.transform(x_cat)

### Test train split and train model 

In [8]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
y_cat_encoded = encoder.fit_transform(y_cat)

In [9]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(x_cat_count, y_cat_encoded, train_size=0.8)

In [10]:
# Extereme Gradient Boosting on Count Vectors
cat_classifier = xg.XGBClassifier()
cat_classifier.fit(train_x.tocsc(), train_y)

pred = cat_classifier.predict(test_x.tocsc())
print(classification_report(pred,test_y)) #, target_names=sorted(np.unique(encoder.inverse_transform(test_y)))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         1
           4       1.00      1.00      1.00         1
           5       0.00      0.00      0.00         2
           6       0.75      0.75      0.75         4
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         3
           9       1.00      1.00      1.00         2
          10       0.80      0.67      0.73         6
          11       0.00      0.00      0.00         0
          12       1.00      1.00      1.00         2
          13       0.67      0.67      0.67         3
          14       0.00      0.00      0.00         0
          15       1.00      0.50      0.67         2
          16       0.60      0.50      0.55         6

    accuracy              

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
cat_pred_all = cat_classifier.predict(x_cat_count.tocsc())
print(classification_report(cat_pred_all,y_cat_encoded,target_names=sorted(np.unique(y_cat))))

              precision    recall  f1-score   support

   bath salt       0.00      0.00      0.00         0
 body butter       0.75      1.00      0.86         3
 body lotion       1.00      1.00      1.00         6
  body scrub       1.00      0.50      0.67         2
   body wash       1.00      1.00      1.00         7
 conditioner       0.83      0.77      0.80        13
  face cream       0.95      0.90      0.93        21
    face gel       1.00      1.00      1.00         3
   face mask       0.89      1.00      0.94        17
  face serum       0.90      1.00      0.95         9
   face wash       0.93      0.82      0.87        34
   facescrub       0.67      1.00      0.80         4
   hair mask       0.80      0.80      0.80        10
    hair oil       0.73      0.89      0.80         9
  hair serum       0.40      1.00      0.57         2
  hand cream       1.00      0.75      0.86         4
     shampoo       0.85      0.71      0.77        24

    accuracy              

In [12]:
XG_accuracies = cross_val_score(estimator = xg.XGBClassifier(), X = x_cat_count, y = y_cat_encoded, cv = 5)
print("Mean CV Accuracy : ", XG_accuracies.mean())
print(XG_accuracies)



Mean CV Accuracy :  0.7262032085561498
[0.76470588 0.73529412 0.67647059 0.63636364 0.81818182]


#### Due to the small amount of training data, the model is not very robust as seen by the varying cross validation accuracies. This can be improved using a wider range of products from other sources or by tuning and using a model trained for a similar use case.

In [13]:
cat_data['predicted_category'] = encoder.inverse_transform(cat_pred_all)

### Ingredient Classification

In [14]:
ing_data = data[['name','key_ingredient']]
ing_data['key_ingredient'].value_counts()

onion                  26
tumeric                23
vitamin c              16
tea tree               15
coffee                 13
charcoal               10
bhringraj              10
argan                   8
mulberry                5
rice                    5
aloe                    5
retinol                 4
cucumber                4
rose                    4
apple cider vinegar     4
honey                   2
gotu kala               2
neem                    2
niacin                  2
rosehip                 2
carrot                  1
ginger                  1
water                   1
butter                  1
castor                  1
epsom                   1
Name: key_ingredient, dtype: int64

In [15]:
# # dropping data points due to very low counts
# ing_data = ing_data[ing_data.groupby('key_ingredient').key_ingredient.transform('count')>4]
# ing_data['key_ingredient'].value_counts()

In [16]:
x_ing = ing_data['name']
y_ing = ing_data['key_ingredient']

In [17]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(x_ing)

# transform the training and validation data using count vectorizer object
x_ing_count = count_vect.transform(x_ing)

In [18]:
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
y_ing_encoded = encoder.fit_transform(y_ing)

# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(x_ing_count, y_ing_encoded, train_size=0.8)

In [19]:
# Extereme Gradient Boosting on Count Vectors
ing_classifier = xg.XGBClassifier()
ing_classifier.fit(train_x.tocsc(), train_y)

pred = ing_classifier.predict(test_x.tocsc())
print(classification_report(pred,test_y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         3
           4       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         3
           8       0.83      1.00      0.91         5
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         1
          17       1.00      1.00      1.00         4
          18       1.00      1.00      1.00         1
          19       1.00      1.00      1.00         1
          20       1.00      1.00      1.00         1
          22       1.00      1.00      1.00         4
          23       1.00      0.80      0.89         5
          24       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
XG_accuracies = cross_val_score(estimator = xg.XGBClassifier(), X = x_ing_count, y = y_ing_encoded, cv = 5)
print("Mean CV Accuracy : ", XG_accuracies.mean())
print(XG_accuracies)



Mean CV Accuracy :  0.8035650623885917
[0.79411765 0.79411765 0.82352941 0.87878788 0.72727273]


In [21]:
ing_pred_all = ing_classifier.predict(x_ing_count.tocsc())
print(classification_report(ing_pred_all,y_ing_encoded,target_names=sorted(np.unique(y_ing))))

                     precision    recall  f1-score   support

               aloe       1.00      1.00      1.00         5
apple cider vinegar       1.00      1.00      1.00         4
              argan       1.00      1.00      1.00         8
          bhringraj       1.00      1.00      1.00        10
             butter       1.00      0.50      0.67         2
             carrot       1.00      1.00      1.00         1
             castor       1.00      1.00      1.00         1
           charcoal       1.00      1.00      1.00        10
             coffee       0.92      1.00      0.96        12
           cucumber       1.00      1.00      1.00         4
              epsom       1.00      1.00      1.00         1
             ginger       1.00      0.50      0.67         2
          gotu kala       1.00      0.67      0.80         3
              honey       1.00      1.00      1.00         2
           mulberry       0.80      1.00      0.89         4
               neem    

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
ing_data['predicted_ingredient'] = encoder.inverse_transform(ing_pred_all)

with pd.ExcelWriter('Prediction Comparison.xlsx') as writer:  
    cat_data.to_excel(writer, sheet_name='Category',index=False)
    ing_data.to_excel(writer, sheet_name='Key Ingredient',index=False)